Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .env +1 -0
- Dockerfile +89 -0
- README.md +339 -7
- __pycache__/api.cpython-311.pyc +0 -0
- __pycache__/config.cpython-310.pyc +0 -0
- __pycache__/config.cpython-311.pyc +0 -0
- __pycache__/crawler.cpython-310.pyc +0 -0
- __pycache__/crawler.cpython-311.pyc +0 -0
- __pycache__/dns_resolver.cpython-310.pyc +0 -0
- __pycache__/dns_resolver.cpython-311.pyc +0 -0
- __pycache__/downloader.cpython-310.pyc +0 -0
- __pycache__/downloader.cpython-311.pyc +0 -0
- __pycache__/frontier.cpython-310.pyc +0 -0
- __pycache__/frontier.cpython-311.pyc +0 -0
- __pycache__/local_config.cpython-310.pyc +0 -0
- __pycache__/local_config.cpython-311.pyc +0 -0
- __pycache__/models.cpython-310.pyc +0 -0
- __pycache__/models.cpython-311.pyc +0 -0
- __pycache__/mongo_cleanup.cpython-310.pyc +0 -0
- __pycache__/mongo_cleanup.cpython-311.pyc +0 -0
- __pycache__/parser.cpython-310.pyc +0 -0
- __pycache__/parser.cpython-311.pyc +0 -0
- __pycache__/robots.cpython-310.pyc +0 -0
- __pycache__/robots.cpython-311.pyc +0 -0
- __pycache__/run_crawler.cpython-310.pyc +0 -0
- api.py +588 -0
- cleanup.py +130 -0
- cleanup_all.sh +47 -0
- config.py +96 -0
- crawl.py +370 -0
- crawler.log +0 -0
- crawler.py +908 -0
- deduplication.py +422 -0
- dns_resolver.py +161 -0
- docker-compose.yml +79 -0
- downloader.py +400 -0
- example.py +250 -0
- file_cleanup.py +100 -0
- frontier.py +319 -0
- models.py +167 -0
- mongo_cleanup.py +86 -0
- parser.py +316 -0
- requirements.txt +43 -0
- robots.py +203 -0
- run_crawler.py +237 -0
- seo_analyzer_ui.py +708 -0
- storage.py +888 -0
- test_crawler.py +219 -0
.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
DEPLOYMENT=true
|
Dockerfile
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10-slim
|
2 |
+
|
3 |
+
# Set working directory
|
4 |
+
WORKDIR /app
|
5 |
+
|
6 |
+
# Install system dependencies
|
7 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
8 |
+
build-essential \
|
9 |
+
wget \
|
10 |
+
curl \
|
11 |
+
gnupg \
|
12 |
+
&& rm -rf /var/lib/apt/lists/*
|
13 |
+
|
14 |
+
# Install MongoDB
|
15 |
+
RUN wget -qO - https://www.mongodb.org/static/pgp/server-6.0.asc | apt-key add - \
|
16 |
+
&& echo "deb http://repo.mongodb.org/apt/debian buster/mongodb-org/6.0 main" | tee /etc/apt/sources.list.d/mongodb-org-6.0.list \
|
17 |
+
&& apt-get update \
|
18 |
+
&& apt-get install -y mongodb-org \
|
19 |
+
&& mkdir -p /data/db \
|
20 |
+
&& apt-get clean \
|
21 |
+
&& rm -rf /var/lib/apt/lists/*
|
22 |
+
|
23 |
+
# Install Redis
|
24 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
25 |
+
redis-server \
|
26 |
+
&& apt-get clean \
|
27 |
+
&& rm -rf /var/lib/apt/lists/*
|
28 |
+
|
29 |
+
# Copy requirements.txt
|
30 |
+
COPY requirements.txt .
|
31 |
+
|
32 |
+
# Install Python dependencies
|
33 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
34 |
+
|
35 |
+
# Copy the crawler code
|
36 |
+
COPY . .
|
37 |
+
|
38 |
+
# Create necessary directories
|
39 |
+
RUN mkdir -p /data/storage/html_pages \
|
40 |
+
&& mkdir -p /data/storage/logs \
|
41 |
+
&& mkdir -p /data/storage/exports
|
42 |
+
|
43 |
+
# Expose ports
|
44 |
+
# Prometheus metrics port
|
45 |
+
EXPOSE 9100
|
46 |
+
# MongoDB port
|
47 |
+
EXPOSE 27017
|
48 |
+
# Redis port
|
49 |
+
EXPOSE 6379
|
50 |
+
|
51 |
+
# Set environment variables
|
52 |
+
ENV MONGODB_URI=mongodb://localhost:27017/
|
53 |
+
ENV REDIS_URI=redis://localhost:6379/0
|
54 |
+
ENV PYTHONUNBUFFERED=1
|
55 |
+
|
56 |
+
# Create entrypoint script
|
57 |
+
RUN echo '#!/bin/bash\n\
|
58 |
+
# Start MongoDB\n\
|
59 |
+
mongod --fork --logpath /var/log/mongodb.log\n\
|
60 |
+
\n\
|
61 |
+
# Start Redis\n\
|
62 |
+
redis-server --daemonize yes\n\
|
63 |
+
\n\
|
64 |
+
# Check if services are running\n\
|
65 |
+
echo "Waiting for MongoDB to start..."\n\
|
66 |
+
until mongo --eval "print(\"MongoDB is ready\")" > /dev/null 2>&1; do\n\
|
67 |
+
sleep 1\n\
|
68 |
+
done\n\
|
69 |
+
\n\
|
70 |
+
echo "Waiting for Redis to start..."\n\
|
71 |
+
until redis-cli ping > /dev/null 2>&1; do\n\
|
72 |
+
sleep 1\n\
|
73 |
+
done\n\
|
74 |
+
\n\
|
75 |
+
echo "All services are running!"\n\
|
76 |
+
\n\
|
77 |
+
# Execute the provided command or default to help\n\
|
78 |
+
if [ $# -eq 0 ]; then\n\
|
79 |
+
python crawl.py --help\n\
|
80 |
+
else\n\
|
81 |
+
exec "$@"\n\
|
82 |
+
fi' > /app/entrypoint.sh \
|
83 |
+
&& chmod +x /app/entrypoint.sh
|
84 |
+
|
85 |
+
# Set entrypoint
|
86 |
+
ENTRYPOINT ["/app/entrypoint.sh"]
|
87 |
+
|
88 |
+
# Default command is to show help
|
89 |
+
CMD ["python", "crawl.py", "--help"]
|
README.md
CHANGED
@@ -1,12 +1,344 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: green
|
5 |
-
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.30.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
|
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: AI_SEO_Crawler
|
3 |
+
app_file: seo_analyzer_ui.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 5.30.0
|
|
|
|
|
6 |
---
|
7 |
+
# Web Crawler Documentation
|
8 |
|
9 |
+
A scalable web crawler with configurability, politeness, and content extraction capabilities.
|
10 |
+
|
11 |
+
## Table of Contents
|
12 |
+
|
13 |
+
- [Architecture](#architecture)
|
14 |
+
- [Setup](#setup)
|
15 |
+
- [Usage](#usage)
|
16 |
+
- [Components](#components)
|
17 |
+
- [Troubleshooting](#troubleshooting)
|
18 |
+
|
19 |
+
## Architecture
|
20 |
+
|
21 |
+
The web crawler consists of the following key components:
|
22 |
+
|
23 |
+
1. **URL Frontier**: Manages URLs to be crawled with prioritization
|
24 |
+
2. **DNS Resolver**: Caches DNS lookups to improve performance
|
25 |
+
3. **Robots Handler**: Ensures compliance with robots.txt
|
26 |
+
4. **HTML Downloader**: Downloads web pages with error handling
|
27 |
+
5. **HTML Parser**: Extracts URLs and metadata from web pages
|
28 |
+
6. **Storage**: MongoDB for storage of URLs and metadata
|
29 |
+
7. **Crawler**: Main crawler orchestration
|
30 |
+
8. **API**: REST API for controlling the crawler
|
31 |
+
|
32 |
+
## Setup
|
33 |
+
|
34 |
+
### Requirements
|
35 |
+
|
36 |
+
- Python 3.8+
|
37 |
+
- MongoDB
|
38 |
+
- Redis server
|
39 |
+
|
40 |
+
### Installation
|
41 |
+
|
42 |
+
1. Install MongoDB:
|
43 |
+
```bash
|
44 |
+
# For Ubuntu
|
45 |
+
sudo apt-get install -y mongodb
|
46 |
+
sudo systemctl start mongodb
|
47 |
+
sudo systemctl enable mongodb
|
48 |
+
|
49 |
+
# Verify MongoDB is running
|
50 |
+
sudo systemctl status mongodb
|
51 |
+
```
|
52 |
+
|
53 |
+
2. Install Redis:
|
54 |
+
```bash
|
55 |
+
sudo apt-get install redis-server
|
56 |
+
sudo systemctl start redis-server
|
57 |
+
|
58 |
+
# Verify Redis is running
|
59 |
+
redis-cli ping # Should return PONG
|
60 |
+
```
|
61 |
+
|
62 |
+
3. Install Python dependencies:
|
63 |
+
```bash
|
64 |
+
pip install -r requirements.txt
|
65 |
+
```
|
66 |
+
|
67 |
+
4. Create a local configuration file:
|
68 |
+
```bash
|
69 |
+
cp config.py local_config.py
|
70 |
+
```
|
71 |
+
|
72 |
+
5. Edit `local_config.py` to customize settings:
|
73 |
+
```python
|
74 |
+
# Example configuration
|
75 |
+
SEED_URLS = ["https://example.com"] # Start URLs
|
76 |
+
MAX_DEPTH = 3 # Crawl depth
|
77 |
+
MAX_WORKERS = 4 # Number of worker threads
|
78 |
+
DELAY_BETWEEN_REQUESTS = 1 # Politeness delay
|
79 |
+
```
|
80 |
+
|
81 |
+
## Usage
|
82 |
+
|
83 |
+
### Running the Crawler
|
84 |
+
|
85 |
+
To run the crawler with default settings:
|
86 |
+
|
87 |
+
```bash
|
88 |
+
cd 4_web_crawler
|
89 |
+
python run_crawler.py
|
90 |
+
```
|
91 |
+
|
92 |
+
To specify custom seed URLs:
|
93 |
+
|
94 |
+
```bash
|
95 |
+
python run_crawler.py --seed https://example.com https://another-site.com
|
96 |
+
```
|
97 |
+
|
98 |
+
To limit crawl depth:
|
99 |
+
|
100 |
+
```bash
|
101 |
+
python run_crawler.py --depth 2
|
102 |
+
```
|
103 |
+
|
104 |
+
To run with more worker threads:
|
105 |
+
|
106 |
+
```bash
|
107 |
+
python run_crawler.py --workers 8
|
108 |
+
```
|
109 |
+
|
110 |
+
### Sample Commands
|
111 |
+
|
112 |
+
Here are some common use cases with sample commands:
|
113 |
+
|
114 |
+
#### Crawl a Single Domain
|
115 |
+
|
116 |
+
This command crawls only example.com, not following external links:
|
117 |
+
|
118 |
+
```bash
|
119 |
+
python run_crawler.py --seed example.com --domain-filter example.com
|
120 |
+
```
|
121 |
+
|
122 |
+
#### Fresh Start (Reset Database)
|
123 |
+
|
124 |
+
This clears both MongoDB and Redis before starting, solving duplicate key errors:
|
125 |
+
|
126 |
+
```bash
|
127 |
+
python run_crawler.py --seed example.com --reset-db
|
128 |
+
```
|
129 |
+
|
130 |
+
#### Custom Speed and Depth
|
131 |
+
|
132 |
+
Control the crawler's speed and depth:
|
133 |
+
|
134 |
+
```bash
|
135 |
+
python run_crawler.py --seed example.com --depth 3 --workers 4 --delay 0.5
|
136 |
+
```
|
137 |
+
|
138 |
+
#### Crawl Multiple Sites
|
139 |
+
|
140 |
+
Crawl multiple websites at once:
|
141 |
+
|
142 |
+
```bash
|
143 |
+
python run_crawler.py --seed example.com blog.example.org docs.example.com
|
144 |
+
```
|
145 |
+
|
146 |
+
#### Ignore robots.txt Rules
|
147 |
+
|
148 |
+
Use with caution, as this ignores website crawling policies:
|
149 |
+
|
150 |
+
```bash
|
151 |
+
python run_crawler.py --seed example.com --ignore-robots
|
152 |
+
```
|
153 |
+
|
154 |
+
#### Set Custom User Agent
|
155 |
+
|
156 |
+
Identity the crawler with a specific user agent:
|
157 |
+
|
158 |
+
```bash
|
159 |
+
python run_crawler.py --seed example.com --user-agent "MyCustomBot/1.0"
|
160 |
+
```
|
161 |
+
|
162 |
+
#### Crawl sagarnildas.com
|
163 |
+
|
164 |
+
To specifically crawl sagarnildas.com with optimal settings:
|
165 |
+
|
166 |
+
```bash
|
167 |
+
python run_crawler.py --seed sagarnildas.com --domain-filter sagarnildas.com --reset-db --workers 2 --depth 3 --verbose
|
168 |
+
```
|
169 |
+
|
170 |
+
### Using the API
|
171 |
+
|
172 |
+
The crawler provides a REST API for control and monitoring:
|
173 |
+
|
174 |
+
```bash
|
175 |
+
cd 4_web_crawler
|
176 |
+
python api.py
|
177 |
+
```
|
178 |
+
|
179 |
+
The API will be available at http://localhost:8000
|
180 |
+
|
181 |
+
#### API Endpoints
|
182 |
+
|
183 |
+
- `GET /status` - Get crawler status
|
184 |
+
- `GET /stats` - Get detailed statistics
|
185 |
+
- `POST /start` - Start the crawler
|
186 |
+
- `POST /stop` - Stop the crawler
|
187 |
+
- `POST /seed` - Add seed URLs
|
188 |
+
- `GET /pages` - List crawled pages
|
189 |
+
- `GET /urls` - List discovered URLs
|
190 |
+
|
191 |
+
### Checking Results
|
192 |
+
|
193 |
+
Monitor the crawler through:
|
194 |
+
|
195 |
+
1. Console output:
|
196 |
+
```bash
|
197 |
+
tail -f crawler.log
|
198 |
+
```
|
199 |
+
|
200 |
+
2. MongoDB collections:
|
201 |
+
```bash
|
202 |
+
# Start mongo shell
|
203 |
+
mongo
|
204 |
+
|
205 |
+
# Switch to crawler database
|
206 |
+
use crawler
|
207 |
+
|
208 |
+
# Count discovered URLs
|
209 |
+
db.urls.count()
|
210 |
+
|
211 |
+
# View crawled pages
|
212 |
+
db.pages.find().limit(5)
|
213 |
+
```
|
214 |
+
|
215 |
+
3. API statistics:
|
216 |
+
```bash
|
217 |
+
curl http://localhost:8000/stats
|
218 |
+
```
|
219 |
+
|
220 |
+
## Components
|
221 |
+
|
222 |
+
The crawler has several key components that work together:
|
223 |
+
|
224 |
+
### URL Frontier
|
225 |
+
|
226 |
+
Manages the queue of URLs to be crawled with priority-based scheduling.
|
227 |
+
|
228 |
+
### DNS Resolver
|
229 |
+
|
230 |
+
Caches DNS lookups to improve performance and reduce load on DNS servers.
|
231 |
+
|
232 |
+
### Robots Handler
|
233 |
+
|
234 |
+
Ensures compliance with robots.txt rules to be a good web citizen.
|
235 |
+
|
236 |
+
### HTML Downloader
|
237 |
+
|
238 |
+
Downloads web pages with error handling, timeouts, and retries.
|
239 |
+
|
240 |
+
### HTML Parser
|
241 |
+
|
242 |
+
Extracts URLs and metadata from web pages.
|
243 |
+
|
244 |
+
### Crawler
|
245 |
+
|
246 |
+
The main component that orchestrates the crawling process.
|
247 |
+
|
248 |
+
## Troubleshooting
|
249 |
+
|
250 |
+
### MongoDB Errors
|
251 |
+
|
252 |
+
If you see duplicate key errors:
|
253 |
+
|
254 |
+
```
|
255 |
+
ERROR: Error saving seed URL to database: E11000 duplicate key error
|
256 |
+
```
|
257 |
+
|
258 |
+
Clean MongoDB collections:
|
259 |
+
|
260 |
+
```bash
|
261 |
+
cd 4_web_crawler
|
262 |
+
python mongo_cleanup.py
|
263 |
+
```
|
264 |
+
|
265 |
+
### Redis Connection Issues
|
266 |
+
|
267 |
+
If the crawler can't connect to Redis:
|
268 |
+
|
269 |
+
1. Check if Redis is running:
|
270 |
+
```bash
|
271 |
+
sudo systemctl status redis-server
|
272 |
+
```
|
273 |
+
|
274 |
+
2. Verify Redis connection:
|
275 |
+
```bash
|
276 |
+
redis-cli ping
|
277 |
+
```
|
278 |
+
|
279 |
+
### Performance Issues
|
280 |
+
|
281 |
+
If the crawler is running slowly:
|
282 |
+
|
283 |
+
1. Increase worker threads in `local_config.py`:
|
284 |
+
```python
|
285 |
+
MAX_WORKERS = 8
|
286 |
+
```
|
287 |
+
|
288 |
+
2. Adjust the politeness delay:
|
289 |
+
```python
|
290 |
+
DELAY_BETWEEN_REQUESTS = 0.5 # Half-second delay
|
291 |
+
```
|
292 |
+
|
293 |
+
3. Optimize DNS caching:
|
294 |
+
```python
|
295 |
+
DNS_CACHE_SIZE = 10000
|
296 |
+
DNS_CACHE_TTL = 7200 # 2 hours
|
297 |
+
```
|
298 |
+
|
299 |
+
### Crawler Not Starting
|
300 |
+
|
301 |
+
If the crawler won't start:
|
302 |
+
|
303 |
+
1. Check for MongoDB connection:
|
304 |
+
```bash
|
305 |
+
mongo --eval "db.version()"
|
306 |
+
```
|
307 |
+
|
308 |
+
2. Ensure Redis is running:
|
309 |
+
```bash
|
310 |
+
redis-cli info
|
311 |
+
```
|
312 |
+
|
313 |
+
3. Look for error messages in the logs:
|
314 |
+
```bash
|
315 |
+
cat crawler.log
|
316 |
+
```
|
317 |
+
|
318 |
+
## Configuration Reference
|
319 |
+
|
320 |
+
Key configurations in `config.py` or `local_config.py`:
|
321 |
+
|
322 |
+
```python
|
323 |
+
# General settings
|
324 |
+
MAX_WORKERS = 4 # Number of worker threads
|
325 |
+
MAX_DEPTH = 3 # Maximum crawl depth
|
326 |
+
SEED_URLS = ["https://example.com"] # Initial URLs
|
327 |
+
|
328 |
+
# Politeness settings
|
329 |
+
RESPECT_ROBOTS_TXT = True # Whether to respect robots.txt
|
330 |
+
USER_AGENT = "MyBot/1.0" # User agent for requests
|
331 |
+
DELAY_BETWEEN_REQUESTS = 1 # Delay between requests to the same domain
|
332 |
+
|
333 |
+
# Storage settings
|
334 |
+
MONGODB_URI = "mongodb://localhost:27017/"
|
335 |
+
MONGODB_DB = "crawler"
|
336 |
+
|
337 |
+
# DNS settings
|
338 |
+
DNS_CACHE_SIZE = 10000
|
339 |
+
DNS_CACHE_TTL = 3600 # 1 hour
|
340 |
+
|
341 |
+
# Logging settings
|
342 |
+
LOG_LEVEL = "INFO"
|
343 |
+
LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
|
344 |
+
```
|
__pycache__/api.cpython-311.pyc
ADDED
Binary file (25.7 kB). View file
|
|
__pycache__/config.cpython-310.pyc
ADDED
Binary file (2.5 kB). View file
|
|
__pycache__/config.cpython-311.pyc
ADDED
Binary file (3.84 kB). View file
|
|
__pycache__/crawler.cpython-310.pyc
ADDED
Binary file (22.3 kB). View file
|
|
__pycache__/crawler.cpython-311.pyc
ADDED
Binary file (40.2 kB). View file
|
|
__pycache__/dns_resolver.cpython-310.pyc
ADDED
Binary file (4.7 kB). View file
|
|
__pycache__/dns_resolver.cpython-311.pyc
ADDED
Binary file (7.84 kB). View file
|
|
__pycache__/downloader.cpython-310.pyc
ADDED
Binary file (10.8 kB). View file
|
|
__pycache__/downloader.cpython-311.pyc
ADDED
Binary file (18.6 kB). View file
|
|
__pycache__/frontier.cpython-310.pyc
ADDED
Binary file (8.74 kB). View file
|
|
__pycache__/frontier.cpython-311.pyc
ADDED
Binary file (20.6 kB). View file
|
|
__pycache__/local_config.cpython-310.pyc
ADDED
Binary file (850 Bytes). View file
|
|
__pycache__/local_config.cpython-311.pyc
ADDED
Binary file (1.31 kB). View file
|
|
__pycache__/models.cpython-310.pyc
ADDED
Binary file (5.57 kB). View file
|
|
__pycache__/models.cpython-311.pyc
ADDED
Binary file (7.77 kB). View file
|
|
__pycache__/mongo_cleanup.cpython-310.pyc
ADDED
Binary file (2.27 kB). View file
|
|
__pycache__/mongo_cleanup.cpython-311.pyc
ADDED
Binary file (4.26 kB). View file
|
|
__pycache__/parser.cpython-310.pyc
ADDED
Binary file (7.95 kB). View file
|
|
__pycache__/parser.cpython-311.pyc
ADDED
Binary file (14.1 kB). View file
|
|
__pycache__/robots.cpython-310.pyc
ADDED
Binary file (4.75 kB). View file
|
|
__pycache__/robots.cpython-311.pyc
ADDED
Binary file (7.92 kB). View file
|
|
__pycache__/run_crawler.cpython-310.pyc
ADDED
Binary file (6.36 kB). View file
|
|
api.py
ADDED
@@ -0,0 +1,588 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Web API for the web crawler.
|
3 |
+
|
4 |
+
This module provides a FastAPI-based web API for controlling and monitoring the web crawler.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
import time
|
10 |
+
import json
|
11 |
+
import logging
|
12 |
+
import datetime
|
13 |
+
from typing import List, Dict, Any, Optional
|
14 |
+
from fastapi import FastAPI, HTTPException, Query, Path, BackgroundTasks, Depends
|
15 |
+
from fastapi.middleware.cors import CORSMiddleware
|
16 |
+
from fastapi.responses import JSONResponse
|
17 |
+
from pydantic import BaseModel, HttpUrl, Field
|
18 |
+
import uvicorn
|
19 |
+
|
20 |
+
from crawler import Crawler
|
21 |
+
from models import URL, URLStatus, Priority
|
22 |
+
import config
|
23 |
+
|
24 |
+
# Configure logging
|
25 |
+
logging.basicConfig(
|
26 |
+
level=getattr(logging, config.LOG_LEVEL),
|
27 |
+
format=config.LOG_FORMAT
|
28 |
+
)
|
29 |
+
logger = logging.getLogger(__name__)
|
30 |
+
|
31 |
+
# Create FastAPI app
|
32 |
+
app = FastAPI(
|
33 |
+
title="Web Crawler API",
|
34 |
+
description="API for controlling and monitoring the web crawler",
|
35 |
+
version="1.0.0"
|
36 |
+
)
|
37 |
+
|
38 |
+
# Enable CORS
|
39 |
+
app.add_middleware(
|
40 |
+
CORSMiddleware,
|
41 |
+
allow_origins=["*"],
|
42 |
+
allow_credentials=True,
|
43 |
+
allow_methods=["*"],
|
44 |
+
allow_headers=["*"],
|
45 |
+
)
|
46 |
+
|
47 |
+
# Global crawler instance
|
48 |
+
crawler = None
|
49 |
+
|
50 |
+
|
51 |
+
def get_crawler() -> Crawler:
|
52 |
+
"""Get or initialize the crawler instance"""
|
53 |
+
global crawler
|
54 |
+
if crawler is None:
|
55 |
+
crawler = Crawler()
|
56 |
+
return crawler
|
57 |
+
|
58 |
+
|
59 |
+
# API Models
|
60 |
+
class SeedURL(BaseModel):
|
61 |
+
url: HttpUrl
|
62 |
+
priority: Optional[str] = Field(
|
63 |
+
default="NORMAL",
|
64 |
+
description="URL priority (VERY_HIGH, HIGH, NORMAL, LOW, VERY_LOW)"
|
65 |
+
)
|
66 |
+
|
67 |
+
|
68 |
+
class SeedURLs(BaseModel):
|
69 |
+
urls: List[SeedURL]
|
70 |
+
|
71 |
+
|
72 |
+
class CrawlerStatus(BaseModel):
|
73 |
+
running: bool
|
74 |
+
paused: bool
|
75 |
+
start_time: Optional[float] = None
|
76 |
+
uptime_seconds: Optional[float] = None
|
77 |
+
pages_crawled: int
|
78 |
+
pages_failed: int
|
79 |
+
urls_discovered: int
|
80 |
+
urls_filtered: int
|
81 |
+
domains_crawled: int
|
82 |
+
frontier_size: int
|
83 |
+
|
84 |
+
|
85 |
+
class CrawlerConfig(BaseModel):
|
86 |
+
max_depth: int = Field(..., description="Maximum crawl depth")
|
87 |
+
max_workers: int = Field(..., description="Maximum number of worker threads")
|
88 |
+
delay_between_requests: float = Field(..., description="Delay between requests to the same domain (seconds)")
|
89 |
+
|
90 |
+
|
91 |
+
class PageDetail(BaseModel):
|
92 |
+
url: str
|
93 |
+
domain: str
|
94 |
+
status_code: int
|
95 |
+
content_type: str
|
96 |
+
content_length: int
|
97 |
+
crawled_at: str
|
98 |
+
is_seed: bool
|
99 |
+
depth: int
|
100 |
+
title: Optional[str] = None
|
101 |
+
description: Optional[str] = None
|
102 |
+
|
103 |
+
|
104 |
+
class URLDetail(BaseModel):
|
105 |
+
url: str
|
106 |
+
normalized_url: str
|
107 |
+
domain: str
|
108 |
+
status: str
|
109 |
+
priority: str
|
110 |
+
depth: int
|
111 |
+
parent_url: Optional[str] = None
|
112 |
+
last_crawled: Optional[str] = None
|
113 |
+
error: Optional[str] = None
|
114 |
+
retries: int
|
115 |
+
|
116 |
+
|
117 |
+
class DomainStats(BaseModel):
|
118 |
+
domain: str
|
119 |
+
pages_count: int
|
120 |
+
successful_requests: int
|
121 |
+
failed_requests: int
|
122 |
+
avg_page_size: float
|
123 |
+
content_types: Dict[str, int]
|
124 |
+
status_codes: Dict[str, int]
|
125 |
+
|
126 |
+
|
127 |
+
# API Routes
|
128 |
+
@app.get("/")
|
129 |
+
async def read_root():
|
130 |
+
"""Root endpoint"""
|
131 |
+
return {
|
132 |
+
"name": "Web Crawler API",
|
133 |
+
"version": "1.0.0",
|
134 |
+
"description": "API for controlling and monitoring the web crawler",
|
135 |
+
"endpoints": {
|
136 |
+
"GET /": "This help message",
|
137 |
+
"GET /status": "Get crawler status",
|
138 |
+
"GET /stats": "Get crawler statistics",
|
139 |
+
"GET /config": "Get crawler configuration",
|
140 |
+
"PUT /config": "Update crawler configuration",
|
141 |
+
"POST /start": "Start the crawler",
|
142 |
+
"POST /stop": "Stop the crawler",
|
143 |
+
"POST /pause": "Pause the crawler",
|
144 |
+
"POST /resume": "Resume the crawler",
|
145 |
+
"GET /pages": "List crawled pages",
|
146 |
+
"GET /pages/{url}": "Get page details",
|
147 |
+
"GET /urls": "List discovered URLs",
|
148 |
+
"GET /urls/{url}": "Get URL details",
|
149 |
+
"POST /seed": "Add seed URLs",
|
150 |
+
"GET /domains": "Get domain statistics",
|
151 |
+
"GET /domains/{domain}": "Get statistics for a specific domain",
|
152 |
+
}
|
153 |
+
}
|
154 |
+
|
155 |
+
|
156 |
+
@app.get("/status", response_model=CrawlerStatus)
|
157 |
+
async def get_status(crawler: Crawler = Depends(get_crawler)):
|
158 |
+
"""Get crawler status"""
|
159 |
+
status = {
|
160 |
+
"running": crawler.running,
|
161 |
+
"paused": crawler.paused,
|
162 |
+
"start_time": crawler.stats.get('start_time'),
|
163 |
+
"uptime_seconds": time.time() - crawler.stats.get('start_time', time.time()) if crawler.running else None,
|
164 |
+
"pages_crawled": crawler.stats.get('pages_crawled', 0),
|
165 |
+
"pages_failed": crawler.stats.get('pages_failed', 0),
|
166 |
+
"urls_discovered": crawler.stats.get('urls_discovered', 0),
|
167 |
+
"urls_filtered": crawler.stats.get('urls_filtered', 0),
|
168 |
+
"domains_crawled": len(crawler.stats.get('domains_crawled', set())),
|
169 |
+
"frontier_size": crawler.frontier.size()
|
170 |
+
}
|
171 |
+
return status
|
172 |
+
|
173 |
+
|
174 |
+
@app.get("/stats")
|
175 |
+
async def get_stats(crawler: Crawler = Depends(get_crawler)):
|
176 |
+
"""Get detailed crawler statistics"""
|
177 |
+
stats = crawler.stats.copy()
|
178 |
+
|
179 |
+
# Convert sets to lists for JSON serialization
|
180 |
+
for key, value in stats.items():
|
181 |
+
if isinstance(value, set):
|
182 |
+
stats[key] = list(value)
|
183 |
+
|
184 |
+
# Add uptime
|
185 |
+
if stats.get('start_time'):
|
186 |
+
stats['uptime_seconds'] = time.time() - stats['start_time']
|
187 |
+
stats['uptime_formatted'] = str(datetime.timedelta(seconds=int(stats['uptime_seconds'])))
|
188 |
+
|
189 |
+
# Add DNS cache statistics if available
|
190 |
+
try:
|
191 |
+
dns_stats = crawler.dns_resolver.get_stats()
|
192 |
+
stats['dns_cache'] = dns_stats
|
193 |
+
except (AttributeError, Exception) as e:
|
194 |
+
logger.warning(f"Failed to get DNS stats: {e}")
|
195 |
+
stats['dns_cache'] = {'error': 'Stats not available'}
|
196 |
+
|
197 |
+
# Add frontier statistics if available
|
198 |
+
try:
|
199 |
+
stats['frontier_size'] = crawler.frontier.size()
|
200 |
+
if hasattr(crawler.frontier, 'get_stats'):
|
201 |
+
frontier_stats = crawler.frontier.get_stats()
|
202 |
+
stats['frontier'] = frontier_stats
|
203 |
+
else:
|
204 |
+
stats['frontier'] = {'size': crawler.frontier.size()}
|
205 |
+
except Exception as e:
|
206 |
+
logger.warning(f"Failed to get frontier stats: {e}")
|
207 |
+
stats['frontier'] = {'error': 'Stats not available'}
|
208 |
+
|
209 |
+
return stats
|
210 |
+
|
211 |
+
|
212 |
+
@app.get("/config", response_model=CrawlerConfig)
|
213 |
+
async def get_config():
|
214 |
+
"""Get crawler configuration"""
|
215 |
+
return {
|
216 |
+
"max_depth": config.MAX_DEPTH,
|
217 |
+
"max_workers": config.MAX_WORKERS,
|
218 |
+
"delay_between_requests": config.DELAY_BETWEEN_REQUESTS
|
219 |
+
}
|
220 |
+
|
221 |
+
|
222 |
+
@app.put("/config", response_model=CrawlerConfig)
|
223 |
+
async def update_config(
|
224 |
+
crawler_config: CrawlerConfig,
|
225 |
+
crawler: Crawler = Depends(get_crawler)
|
226 |
+
):
|
227 |
+
"""Update crawler configuration"""
|
228 |
+
# Update configuration
|
229 |
+
config.MAX_DEPTH = crawler_config.max_depth
|
230 |
+
config.MAX_WORKERS = crawler_config.max_workers
|
231 |
+
config.DELAY_BETWEEN_REQUESTS = crawler_config.delay_between_requests
|
232 |
+
|
233 |
+
return crawler_config
|
234 |
+
|
235 |
+
|
236 |
+
@app.post("/start")
|
237 |
+
async def start_crawler(
|
238 |
+
background_tasks: BackgroundTasks,
|
239 |
+
num_workers: int = Query(None, description="Number of worker threads"),
|
240 |
+
async_mode: bool = Query(False, description="Whether to use async mode"),
|
241 |
+
crawler: Crawler = Depends(get_crawler)
|
242 |
+
):
|
243 |
+
"""Start the crawler"""
|
244 |
+
if crawler.running:
|
245 |
+
return {"status": "Crawler is already running"}
|
246 |
+
|
247 |
+
# Start crawler in background
|
248 |
+
def start_crawler_task():
|
249 |
+
try:
|
250 |
+
crawler.start(num_workers=num_workers, async_mode=async_mode)
|
251 |
+
except Exception as e:
|
252 |
+
logger.error(f"Error starting crawler: {e}")
|
253 |
+
|
254 |
+
background_tasks.add_task(start_crawler_task)
|
255 |
+
|
256 |
+
return {"status": "Crawler starting in background"}
|
257 |
+
|
258 |
+
|
259 |
+
@app.post("/stop")
|
260 |
+
async def stop_crawler(crawler: Crawler = Depends(get_crawler)):
|
261 |
+
"""Stop the crawler"""
|
262 |
+
if not crawler.running:
|
263 |
+
return {"status": "Crawler is not running"}
|
264 |
+
|
265 |
+
crawler.stop()
|
266 |
+
return {"status": "Crawler stopped"}
|
267 |
+
|
268 |
+
|
269 |
+
@app.post("/pause")
|
270 |
+
async def pause_crawler(crawler: Crawler = Depends(get_crawler)):
|
271 |
+
"""Pause the crawler"""
|
272 |
+
if not crawler.running:
|
273 |
+
return {"status": "Crawler is not running"}
|
274 |
+
|
275 |
+
if crawler.paused:
|
276 |
+
return {"status": "Crawler is already paused"}
|
277 |
+
|
278 |
+
crawler.pause()
|
279 |
+
return {"status": "Crawler paused"}
|
280 |
+
|
281 |
+
|
282 |
+
@app.post("/resume")
|
283 |
+
async def resume_crawler(crawler: Crawler = Depends(get_crawler)):
|
284 |
+
"""Resume the crawler"""
|
285 |
+
if not crawler.running:
|
286 |
+
return {"status": "Crawler is not running"}
|
287 |
+
|
288 |
+
if not crawler.paused:
|
289 |
+
return {"status": "Crawler is not paused"}
|
290 |
+
|
291 |
+
crawler.resume()
|
292 |
+
return {"status": "Crawler resumed"}
|
293 |
+
|
294 |
+
|
295 |
+
@app.get("/pages")
|
296 |
+
async def list_pages(
|
297 |
+
limit: int = Query(10, ge=1, le=100, description="Number of pages to return"),
|
298 |
+
offset: int = Query(0, ge=0, description="Offset for pagination"),
|
299 |
+
domain: Optional[str] = Query(None, description="Filter by domain"),
|
300 |
+
status_code: Optional[int] = Query(None, description="Filter by HTTP status code"),
|
301 |
+
crawler: Crawler = Depends(get_crawler)
|
302 |
+
):
|
303 |
+
"""List crawled pages"""
|
304 |
+
# Build query
|
305 |
+
query = {}
|
306 |
+
if domain:
|
307 |
+
query['domain'] = domain
|
308 |
+
if status_code:
|
309 |
+
query['status_code'] = status_code
|
310 |
+
|
311 |
+
# Execute query
|
312 |
+
try:
|
313 |
+
pages = list(crawler.db.pages_collection.find(
|
314 |
+
query,
|
315 |
+
{'_id': 0}
|
316 |
+
).skip(offset).limit(limit))
|
317 |
+
|
318 |
+
# Count total pages matching query
|
319 |
+
total_count = crawler.db.pages_collection.count_documents(query)
|
320 |
+
|
321 |
+
return {
|
322 |
+
"pages": pages,
|
323 |
+
"total": total_count,
|
324 |
+
"limit": limit,
|
325 |
+
"offset": offset
|
326 |
+
}
|
327 |
+
except Exception as e:
|
328 |
+
logger.error(f"Error listing pages: {e}")
|
329 |
+
raise HTTPException(status_code=500, detail=str(e))
|
330 |
+
|
331 |
+
|
332 |
+
@app.get("/pages/{url:path}", response_model=PageDetail)
|
333 |
+
async def get_page(
|
334 |
+
url: str,
|
335 |
+
include_content: bool = Query(False, description="Include page content"),
|
336 |
+
crawler: Crawler = Depends(get_crawler)
|
337 |
+
):
|
338 |
+
"""Get page details"""
|
339 |
+
try:
|
340 |
+
# Decode URL from path parameter
|
341 |
+
url = url.replace("___", "/")
|
342 |
+
|
343 |
+
# Find page in database
|
344 |
+
page = crawler.db.pages_collection.find_one({'url': url}, {'_id': 0})
|
345 |
+
|
346 |
+
if not page:
|
347 |
+
raise HTTPException(status_code=404, detail="Page not found")
|
348 |
+
|
349 |
+
# Load content if requested
|
350 |
+
if include_content:
|
351 |
+
try:
|
352 |
+
if crawler.use_s3:
|
353 |
+
content = crawler._load_content_s3(url)
|
354 |
+
else:
|
355 |
+
content = crawler._load_content_disk(url)
|
356 |
+
|
357 |
+
if content:
|
358 |
+
page['content'] = content
|
359 |
+
except Exception as e:
|
360 |
+
logger.error(f"Error loading content for {url}: {e}")
|
361 |
+
page['content'] = None
|
362 |
+
|
363 |
+
return page
|
364 |
+
except HTTPException:
|
365 |
+
raise
|
366 |
+
except Exception as e:
|
367 |
+
logger.error(f"Error getting page {url}: {e}")
|
368 |
+
raise HTTPException(status_code=500, detail=str(e))
|
369 |
+
|
370 |
+
|
371 |
+
@app.get("/urls")
|
372 |
+
async def list_urls(
|
373 |
+
limit: int = Query(10, ge=1, le=100, description="Number of URLs to return"),
|
374 |
+
offset: int = Query(0, ge=0, description="Offset for pagination"),
|
375 |
+
status: Optional[str] = Query(None, description="Filter by URL status"),
|
376 |
+
domain: Optional[str] = Query(None, description="Filter by domain"),
|
377 |
+
priority: Optional[str] = Query(None, description="Filter by priority"),
|
378 |
+
crawler: Crawler = Depends(get_crawler)
|
379 |
+
):
|
380 |
+
"""List discovered URLs"""
|
381 |
+
# Build query
|
382 |
+
query = {}
|
383 |
+
if status:
|
384 |
+
query['status'] = status
|
385 |
+
if domain:
|
386 |
+
query['domain'] = domain
|
387 |
+
if priority:
|
388 |
+
query['priority'] = priority
|
389 |
+
|
390 |
+
# Execute query
|
391 |
+
try:
|
392 |
+
urls = list(crawler.db.urls_collection.find(
|
393 |
+
query,
|
394 |
+
{'_id': 0}
|
395 |
+
).skip(offset).limit(limit))
|
396 |
+
|
397 |
+
# Count total URLs matching query
|
398 |
+
total_count = crawler.db.urls_collection.count_documents(query)
|
399 |
+
|
400 |
+
return {
|
401 |
+
"urls": urls,
|
402 |
+
"total": total_count,
|
403 |
+
"limit": limit,
|
404 |
+
"offset": offset
|
405 |
+
}
|
406 |
+
except Exception as e:
|
407 |
+
logger.error(f"Error listing URLs: {e}")
|
408 |
+
raise HTTPException(status_code=500, detail=str(e))
|
409 |
+
|
410 |
+
|
411 |
+
@app.get("/urls/{url:path}", response_model=URLDetail)
|
412 |
+
async def get_url(
|
413 |
+
url: str,
|
414 |
+
crawler: Crawler = Depends(get_crawler)
|
415 |
+
):
|
416 |
+
"""Get URL details"""
|
417 |
+
try:
|
418 |
+
# Decode URL from path parameter
|
419 |
+
url = url.replace("___", "/")
|
420 |
+
|
421 |
+
# Find URL in database
|
422 |
+
url_obj = crawler.db.urls_collection.find_one({'url': url}, {'_id': 0})
|
423 |
+
|
424 |
+
if not url_obj:
|
425 |
+
raise HTTPException(status_code=404, detail="URL not found")
|
426 |
+
|
427 |
+
return url_obj
|
428 |
+
except HTTPException:
|
429 |
+
raise
|
430 |
+
except Exception as e:
|
431 |
+
logger.error(f"Error getting URL {url}: {e}")
|
432 |
+
raise HTTPException(status_code=500, detail=str(e))
|
433 |
+
|
434 |
+
|
435 |
+
@app.post("/seed")
|
436 |
+
async def add_seed_urls(
|
437 |
+
seed_urls: SeedURLs,
|
438 |
+
crawler: Crawler = Depends(get_crawler)
|
439 |
+
):
|
440 |
+
"""Add seed URLs to the frontier"""
|
441 |
+
try:
|
442 |
+
urls_added = 0
|
443 |
+
for seed in seed_urls.urls:
|
444 |
+
url = str(seed.url)
|
445 |
+
priority = getattr(Priority, seed.priority, Priority.NORMAL)
|
446 |
+
|
447 |
+
# Create URL object
|
448 |
+
url_obj = URL(
|
449 |
+
url=url,
|
450 |
+
status=URLStatus.PENDING,
|
451 |
+
priority=priority,
|
452 |
+
depth=0 # Seed URLs are at depth 0
|
453 |
+
)
|
454 |
+
|
455 |
+
# Add to frontier
|
456 |
+
if crawler.frontier.add_url(url_obj):
|
457 |
+
# Save URL to database
|
458 |
+
crawler.urls_collection.update_one(
|
459 |
+
{'url': url},
|
460 |
+
{'$set': url_obj.dict()},
|
461 |
+
upsert=True
|
462 |
+
)
|
463 |
+
|
464 |
+
urls_added += 1
|
465 |
+
logger.info(f"Added seed URL: {url}")
|
466 |
+
|
467 |
+
return {"status": "success", "urls_added": urls_added}
|
468 |
+
except Exception as e:
|
469 |
+
logger.error(f"Error adding seed URLs: {e}")
|
470 |
+
raise HTTPException(status_code=500, detail=str(e))
|
471 |
+
|
472 |
+
|
473 |
+
@app.get("/domains")
|
474 |
+
async def list_domains(
|
475 |
+
limit: int = Query(10, ge=1, le=100, description="Number of domains to return"),
|
476 |
+
offset: int = Query(0, ge=0, description="Offset for pagination"),
|
477 |
+
crawler: Crawler = Depends(get_crawler)
|
478 |
+
):
|
479 |
+
"""Get domain statistics"""
|
480 |
+
try:
|
481 |
+
# Get domains with counts
|
482 |
+
domain_counts = crawler.db.pages_collection.aggregate([
|
483 |
+
{"$group": {
|
484 |
+
"_id": "$domain",
|
485 |
+
"pages_count": {"$sum": 1},
|
486 |
+
"avg_page_size": {"$avg": "$content_length"}
|
487 |
+
}},
|
488 |
+
{"$sort": {"pages_count": -1}},
|
489 |
+
{"$skip": offset},
|
490 |
+
{"$limit": limit}
|
491 |
+
])
|
492 |
+
|
493 |
+
# Get total domains count
|
494 |
+
total_domains = len(crawler.stats.get('domains_crawled', set()))
|
495 |
+
|
496 |
+
# Format result
|
497 |
+
domains = []
|
498 |
+
for domain in domain_counts:
|
499 |
+
domains.append({
|
500 |
+
"domain": domain["_id"],
|
501 |
+
"pages_count": domain["pages_count"],
|
502 |
+
"avg_page_size": domain["avg_page_size"]
|
503 |
+
})
|
504 |
+
|
505 |
+
return {
|
506 |
+
"domains": domains,
|
507 |
+
"total": total_domains,
|
508 |
+
"limit": limit,
|
509 |
+
"offset": offset
|
510 |
+
}
|
511 |
+
except Exception as e:
|
512 |
+
logger.error(f"Error listing domains: {e}")
|
513 |
+
raise HTTPException(status_code=500, detail=str(e))
|
514 |
+
|
515 |
+
|
516 |
+
@app.get("/domains/{domain}", response_model=DomainStats)
|
517 |
+
async def get_domain_stats(
|
518 |
+
domain: str,
|
519 |
+
crawler: Crawler = Depends(get_crawler)
|
520 |
+
):
|
521 |
+
"""Get statistics for a specific domain"""
|
522 |
+
try:
|
523 |
+
# Get basic domain stats
|
524 |
+
domain_stats = crawler.db.pages_collection.aggregate([
|
525 |
+
{"$match": {"domain": domain}},
|
526 |
+
{"$group": {
|
527 |
+
"_id": "$domain",
|
528 |
+
"pages_count": {"$sum": 1},
|
529 |
+
"successful_requests": {"$sum": {"$cond": [{"$lt": ["$status_code", 400]}, 1, 0]}},
|
530 |
+
"failed_requests": {"$sum": {"$cond": [{"$gte": ["$status_code", 400]}, 1, 0]}},
|
531 |
+
"avg_page_size": {"$avg": "$content_length"}
|
532 |
+
}}
|
533 |
+
]).next()
|
534 |
+
|
535 |
+
# Get content type distribution
|
536 |
+
content_types = crawler.db.pages_collection.aggregate([
|
537 |
+
{"$match": {"domain": domain}},
|
538 |
+
{"$group": {
|
539 |
+
"_id": "$content_type",
|
540 |
+
"count": {"$sum": 1}
|
541 |
+
}}
|
542 |
+
])
|
543 |
+
|
544 |
+
content_type_map = {}
|
545 |
+
for ct in content_types:
|
546 |
+
content_type_map[ct["_id"]] = ct["count"]
|
547 |
+
|
548 |
+
# Get status code distribution
|
549 |
+
status_codes = crawler.db.pages_collection.aggregate([
|
550 |
+
{"$match": {"domain": domain}},
|
551 |
+
{"$group": {
|
552 |
+
"_id": "$status_code",
|
553 |
+
"count": {"$sum": 1}
|
554 |
+
}}
|
555 |
+
])
|
556 |
+
|
557 |
+
status_code_map = {}
|
558 |
+
for sc in status_codes:
|
559 |
+
status_code_map[str(sc["_id"])] = sc["count"]
|
560 |
+
|
561 |
+
# Format result
|
562 |
+
result = {
|
563 |
+
"domain": domain,
|
564 |
+
"pages_count": domain_stats["pages_count"],
|
565 |
+
"successful_requests": domain_stats["successful_requests"],
|
566 |
+
"failed_requests": domain_stats["failed_requests"],
|
567 |
+
"avg_page_size": domain_stats["avg_page_size"],
|
568 |
+
"content_types": content_type_map,
|
569 |
+
"status_codes": status_code_map
|
570 |
+
}
|
571 |
+
|
572 |
+
return result
|
573 |
+
except StopIteration:
|
574 |
+
# Domain not found
|
575 |
+
raise HTTPException(status_code=404, detail=f"Domain '{domain}' not found")
|
576 |
+
except Exception as e:
|
577 |
+
logger.error(f"Error getting domain stats for {domain}: {e}")
|
578 |
+
raise HTTPException(status_code=500, detail=str(e))
|
579 |
+
|
580 |
+
|
581 |
+
if __name__ == "__main__":
|
582 |
+
# Run the API server
|
583 |
+
uvicorn.run(
|
584 |
+
"api:app",
|
585 |
+
host="0.0.0.0",
|
586 |
+
port=8000,
|
587 |
+
reload=True
|
588 |
+
)
|
cleanup.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Cleanup script to remove all web crawler data from MongoDB
|
4 |
+
and list files to be removed
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
import logging
|
10 |
+
import shutil
|
11 |
+
from pymongo import MongoClient
|
12 |
+
|
13 |
+
# Configure logging
|
14 |
+
logging.basicConfig(
|
15 |
+
level=logging.INFO,
|
16 |
+
format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
|
17 |
+
)
|
18 |
+
logger = logging.getLogger("cleanup")
|
19 |
+
|
20 |
+
def cleanup_mongodb():
|
21 |
+
"""Remove all web crawler data from MongoDB"""
|
22 |
+
try:
|
23 |
+
# Connect to MongoDB
|
24 |
+
logger.info("Connecting to MongoDB...")
|
25 |
+
client = MongoClient("mongodb://localhost:27017/")
|
26 |
+
|
27 |
+
# Access crawler database
|
28 |
+
db = client["crawler"]
|
29 |
+
|
30 |
+
# List and drop all collections
|
31 |
+
collections = db.list_collection_names()
|
32 |
+
|
33 |
+
if not collections:
|
34 |
+
logger.info("No collections found in the crawler database")
|
35 |
+
else:
|
36 |
+
logger.info(f"Found {len(collections)} collections to drop: {collections}")
|
37 |
+
|
38 |
+
for collection in collections:
|
39 |
+
logger.info(f"Dropping collection: {collection}")
|
40 |
+
db[collection].drop()
|
41 |
+
|
42 |
+
logger.info("All crawler collections dropped successfully")
|
43 |
+
|
44 |
+
# Optional: Drop the entire database
|
45 |
+
# client.drop_database("crawler")
|
46 |
+
# logger.info("Dropped entire crawler database")
|
47 |
+
|
48 |
+
logger.info("MongoDB cleanup completed")
|
49 |
+
|
50 |
+
except Exception as e:
|
51 |
+
logger.error(f"Error cleaning up MongoDB: {e}")
|
52 |
+
return False
|
53 |
+
|
54 |
+
return True
|
55 |
+
|
56 |
+
def cleanup_files():
|
57 |
+
"""List and remove files related to simple_crawler"""
|
58 |
+
try:
|
59 |
+
crawler_dir = os.path.dirname(os.path.abspath(__file__))
|
60 |
+
|
61 |
+
# Files directly related to simple_crawler
|
62 |
+
simple_crawler_files = [
|
63 |
+
os.path.join(crawler_dir, "simple_crawler.py"),
|
64 |
+
os.path.join(crawler_dir, "README_SIMPLE.md"),
|
65 |
+
os.path.join(crawler_dir, "simple_crawler.log")
|
66 |
+
]
|
67 |
+
|
68 |
+
# Check storage directories
|
69 |
+
storage_dir = os.path.join(crawler_dir, "storage")
|
70 |
+
if os.path.exists(storage_dir):
|
71 |
+
logger.info(f"Will remove storage directory: {storage_dir}")
|
72 |
+
simple_crawler_files.append(storage_dir)
|
73 |
+
|
74 |
+
# List all files that will be removed
|
75 |
+
logger.info("The following files will be removed:")
|
76 |
+
for file_path in simple_crawler_files:
|
77 |
+
if os.path.exists(file_path):
|
78 |
+
logger.info(f" - {file_path}")
|
79 |
+
else:
|
80 |
+
logger.info(f" - {file_path} (not found)")
|
81 |
+
|
82 |
+
# Confirm removal
|
83 |
+
confirm = input("Do you want to proceed with removal? (y/n): ")
|
84 |
+
if confirm.lower() != 'y':
|
85 |
+
logger.info("File removal cancelled")
|
86 |
+
return False
|
87 |
+
|
88 |
+
# Remove files and directories
|
89 |
+
for file_path in simple_crawler_files:
|
90 |
+
if os.path.exists(file_path):
|
91 |
+
if os.path.isdir(file_path):
|
92 |
+
logger.info(f"Removing directory: {file_path}")
|
93 |
+
shutil.rmtree(file_path)
|
94 |
+
else:
|
95 |
+
logger.info(f"Removing file: {file_path}")
|
96 |
+
os.remove(file_path)
|
97 |
+
|
98 |
+
logger.info("File cleanup completed")
|
99 |
+
|
100 |
+
except Exception as e:
|
101 |
+
logger.error(f"Error cleaning up files: {e}")
|
102 |
+
return False
|
103 |
+
|
104 |
+
return True
|
105 |
+
|
106 |
+
if __name__ == "__main__":
|
107 |
+
print("Web Crawler Cleanup Utility")
|
108 |
+
print("---------------------------")
|
109 |
+
print("This script will:")
|
110 |
+
print("1. Remove all web crawler collections from MongoDB")
|
111 |
+
print("2. List and remove files related to simple_crawler")
|
112 |
+
print()
|
113 |
+
|
114 |
+
proceed = input("Do you want to proceed? (y/n): ")
|
115 |
+
if proceed.lower() != 'y':
|
116 |
+
print("Cleanup cancelled")
|
117 |
+
sys.exit(0)
|
118 |
+
|
119 |
+
# Clean up MongoDB
|
120 |
+
print("\nStep 1: Cleaning up MongoDB...")
|
121 |
+
mongo_success = cleanup_mongodb()
|
122 |
+
|
123 |
+
# Clean up files
|
124 |
+
print("\nStep 2: Cleaning up files...")
|
125 |
+
files_success = cleanup_files()
|
126 |
+
|
127 |
+
# Summary
|
128 |
+
print("\nCleanup Summary:")
|
129 |
+
print(f"MongoDB cleanup: {'Completed' if mongo_success else 'Failed'}")
|
130 |
+
print(f"File cleanup: {'Completed' if files_success else 'Failed'}")
|
cleanup_all.sh
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
# Master cleanup script for web crawler - runs both MongoDB and file cleanup
|
3 |
+
|
4 |
+
set -e # Exit on error
|
5 |
+
|
6 |
+
echo "====================================================="
|
7 |
+
echo " WEB CRAWLER COMPLETE CLEANUP "
|
8 |
+
echo "====================================================="
|
9 |
+
echo
|
10 |
+
|
11 |
+
# Get script directory
|
12 |
+
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
13 |
+
cd "$SCRIPT_DIR"
|
14 |
+
|
15 |
+
# Check if scripts exist
|
16 |
+
if [ ! -f "./mongo_cleanup.py" ] || [ ! -f "./file_cleanup.py" ]; then
|
17 |
+
echo "Error: Required cleanup scripts not found in $SCRIPT_DIR"
|
18 |
+
exit 1
|
19 |
+
fi
|
20 |
+
|
21 |
+
# Ensure scripts are executable
|
22 |
+
chmod +x ./mongo_cleanup.py
|
23 |
+
chmod +x ./file_cleanup.py
|
24 |
+
|
25 |
+
# Step 1: MongoDB cleanup
|
26 |
+
echo "Step 1: MongoDB Cleanup"
|
27 |
+
echo "----------------------"
|
28 |
+
if [ "$1" == "--force" ]; then
|
29 |
+
python3 ./mongo_cleanup.py --force
|
30 |
+
else
|
31 |
+
python3 ./mongo_cleanup.py
|
32 |
+
fi
|
33 |
+
|
34 |
+
# Step 2: File cleanup
|
35 |
+
echo
|
36 |
+
echo "Step 2: File Cleanup"
|
37 |
+
echo "------------------"
|
38 |
+
if [ "$1" == "--force" ]; then
|
39 |
+
python3 ./file_cleanup.py --force
|
40 |
+
else
|
41 |
+
python3 ./file_cleanup.py
|
42 |
+
fi
|
43 |
+
|
44 |
+
echo
|
45 |
+
echo "====================================================="
|
46 |
+
echo " CLEANUP PROCESS COMPLETED "
|
47 |
+
echo "====================================================="
|
config.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Configuration settings for the web crawler
|
3 |
+
"""
|
4 |
+
|
5 |
+
import os
|
6 |
+
from typing import Dict, List, Any, Optional
|
7 |
+
|
8 |
+
# General settings
|
9 |
+
MAX_WORKERS = 100 # Maximum number of worker threads/processes
|
10 |
+
MAX_DEPTH = 10 # Maximum depth to crawl from seed URLs
|
11 |
+
CRAWL_TIMEOUT = 30 # Timeout for HTTP requests in seconds
|
12 |
+
USER_AGENT = "Mozilla/5.0 WebCrawler/1.0 (+https://example.org/bot)"
|
13 |
+
|
14 |
+
# Politeness settings
|
15 |
+
ROBOTSTXT_OBEY = True # Whether to obey robots.txt rules
|
16 |
+
DOWNLOAD_DELAY = 1.0 # Delay between requests to the same domain (seconds)
|
17 |
+
MAX_REQUESTS_PER_DOMAIN = 10 # Maximum concurrent requests per domain
|
18 |
+
RESPECT_CRAWL_DELAY = True # Respect Crawl-delay in robots.txt
|
19 |
+
RETRY_TIMES = 3 # Number of retries for failed requests
|
20 |
+
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429] # HTTP codes to retry
|
21 |
+
|
22 |
+
# URL settings
|
23 |
+
ALLOWED_DOMAINS: Optional[List[str]] = None # Domains to restrict crawling to (None = all domains)
|
24 |
+
EXCLUDED_DOMAINS: List[str] = [] # Domains to exclude from crawling
|
25 |
+
ALLOWED_SCHEMES = ["http", "https"] # URL schemes to allow
|
26 |
+
URL_FILTERS = [
|
27 |
+
# Only filter out binary and media files
|
28 |
+
r".*\.(jpg|jpeg|gif|png|ico|mp3|mp4|wav|avi|mov|mpeg|pdf|zip|rar|gz|exe|dmg|pkg|iso|bin)$",
|
29 |
+
] # Regex patterns to filter out URLs
|
30 |
+
|
31 |
+
# Storage settings
|
32 |
+
MONGODB_URI = "mongodb://localhost:27017/"
|
33 |
+
MONGODB_DB = "webcrawler"
|
34 |
+
REDIS_URI = "redis://localhost:6379/0"
|
35 |
+
STORAGE_PATH = os.path.join(os.path.dirname(__file__), "storage")
|
36 |
+
HTML_STORAGE_PATH = os.path.join(STORAGE_PATH, "html")
|
37 |
+
LOG_PATH = os.path.join(STORAGE_PATH, "logs")
|
38 |
+
|
39 |
+
# Frontier settings
|
40 |
+
FRONTIER_QUEUE_SIZE = 100000 # Maximum number of URLs in the frontier queue
|
41 |
+
PRIORITY_QUEUE_NUM = 5 # Number of priority queues
|
42 |
+
HOST_QUEUE_NUM = 1000 # Number of host queues for politeness
|
43 |
+
|
44 |
+
# Content settings
|
45 |
+
MAX_CONTENT_SIZE = 10 * 1024 * 1024 # Maximum size of HTML content to download (10MB)
|
46 |
+
ALLOWED_CONTENT_TYPES = [
|
47 |
+
"text/html",
|
48 |
+
"application/xhtml+xml",
|
49 |
+
"text/plain", # Some servers might serve HTML as text/plain
|
50 |
+
"application/html",
|
51 |
+
"*/*", # Accept any content type
|
52 |
+
] # Allowed content types
|
53 |
+
|
54 |
+
# DNS settings
|
55 |
+
DNS_CACHE_SIZE = 10000 # Maximum number of entries in DNS cache
|
56 |
+
DNS_CACHE_TIMEOUT = 3600 # DNS cache timeout in seconds
|
57 |
+
|
58 |
+
# Logging settings
|
59 |
+
LOG_LEVEL = "INFO"
|
60 |
+
LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
|
61 |
+
|
62 |
+
# Seed URLs
|
63 |
+
SEED_URLS = [
|
64 |
+
"https://en.wikipedia.org/",
|
65 |
+
"https://www.nytimes.com/",
|
66 |
+
"https://www.bbc.com/",
|
67 |
+
"https://www.github.com/",
|
68 |
+
"https://www.reddit.com/",
|
69 |
+
]
|
70 |
+
|
71 |
+
# Override settings with environment variables
|
72 |
+
def get_env_settings() -> Dict[str, Any]:
|
73 |
+
"""Get settings from environment variables"""
|
74 |
+
env_settings = {}
|
75 |
+
|
76 |
+
for key, value in globals().items():
|
77 |
+
if key.isupper(): # Only consider uppercase variables as settings
|
78 |
+
env_value = os.environ.get(f"WEBCRAWLER_{key}")
|
79 |
+
if env_value is not None:
|
80 |
+
# Convert to appropriate type based on default value
|
81 |
+
if isinstance(value, bool):
|
82 |
+
env_settings[key] = env_value.lower() in ("true", "1", "yes")
|
83 |
+
elif isinstance(value, int):
|
84 |
+
env_settings[key] = int(env_value)
|
85 |
+
elif isinstance(value, float):
|
86 |
+
env_settings[key] = float(env_value)
|
87 |
+
elif isinstance(value, list):
|
88 |
+
# Assume comma-separated values
|
89 |
+
env_settings[key] = [item.strip() for item in env_value.split(",")]
|
90 |
+
else:
|
91 |
+
env_settings[key] = env_value
|
92 |
+
|
93 |
+
return env_settings
|
94 |
+
|
95 |
+
# Update settings with environment variables
|
96 |
+
globals().update(get_env_settings())
|
crawl.py
ADDED
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
"""
Command-line interface for the web crawler.

Usage:
    crawl.py start [--workers=<num>] [--async] [--seed=<url>...]
    crawl.py stop
    crawl.py pause
    crawl.py resume
    crawl.py stats
    crawl.py clean [--days=<days>]
    crawl.py export [--format=<format>] [--output=<file>]
    crawl.py set-max-depth <depth>
    crawl.py add-seed <url>...
    crawl.py (-h | --help)
    crawl.py --version

Options:
    -h --help            Show this help message
    --version            Show version
    --workers=<num>      Number of worker threads [default: 4]
    --async              Use asynchronous mode
    --seed=<url>         Seed URL(s) to start crawling
    --days=<days>        Days threshold for data cleaning [default: 90]
    --format=<format>    Export format (json, csv) [default: json]
    --output=<file>      Output file path [default: crawl_data.json]
"""

import os
import sys
import time
import json
import signal
import logging
import csv
from typing import List, Dict, Any
from docopt import docopt
import datetime
import traceback

from models import URL, URLStatus, Priority
from crawler import Crawler
import config

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)

# Global crawler instance
crawler = None


def initialize_crawler() -> Crawler:
    """Initialize the crawler instance"""
    global crawler
    if crawler is None:
        crawler = Crawler()
    return crawler


def start_crawler(workers: int, async_mode: bool, seed_urls: List[str]) -> None:
    """
    Start the crawler

    Args:
        workers: Number of worker threads
        async_mode: Whether to use async mode
        seed_urls: List of seed URLs to add
    """
    crawler = initialize_crawler()

    # Add seed URLs if provided
    if seed_urls:
        num_added = crawler.add_seed_urls(seed_urls)
        logger.info(f"Added {num_added} seed URLs")

    # Start crawler
    try:
        crawler.start(num_workers=workers, async_mode=async_mode)
    except KeyboardInterrupt:
        logger.info("Crawler interrupted by user")
        crawler.stop()
    except Exception as e:
        logger.error(f"Error starting crawler: {e}")
        logger.error(traceback.format_exc())
        crawler.stop()


def stop_crawler() -> None:
    """Stop the crawler"""
    if crawler is None:
        logger.error("Crawler is not running")
        return

    crawler.stop()
    logger.info("Crawler stopped")


def pause_crawler() -> None:
    """Pause the crawler"""
    if crawler is None:
        logger.error("Crawler is not running")
        return

    crawler.pause()
    logger.info("Crawler paused")


def resume_crawler() -> None:
    """Resume the crawler"""
    if crawler is None:
        logger.error("Crawler is not running")
        return

    crawler.resume()
    logger.info("Crawler resumed")


def show_stats() -> None:
    """Show crawler statistics"""
    if crawler is None:
        logger.error("Crawler is not running")
        return

    # Get crawler stats
    stats = crawler.stats

    # Calculate elapsed time
    elapsed = time.time() - stats['start_time']
    elapsed_str = str(datetime.timedelta(seconds=int(elapsed)))

    # Format statistics
    print("\n=== Crawler Statistics ===")
    print(f"Running time: {elapsed_str}")
    print(f"Pages crawled: {stats['pages_crawled']}")
    print(f"Pages failed: {stats['pages_failed']}")
    print(f"URLs discovered: {stats['urls_discovered']}")
    print(f"URLs filtered: {stats['urls_filtered']}")

    # Calculate pages per second
    pages_per_second = stats['pages_crawled'] / elapsed if elapsed > 0 else 0
    print(f"Crawl rate: {pages_per_second:.2f} pages/second")

    # Domain statistics
    domains = len(stats['domains_crawled'])
    print(f"Domains crawled: {domains}")

    # Status code statistics
    print("\n--- HTTP Status Codes ---")
    for status, count in sorted(stats['status_codes'].items()):
        print(f"  {status}: {count}")

    # Content type statistics
    print("\n--- Content Types ---")
    for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {content_type}: {count}")

    # Frontier size
    print(f"\nFrontier size: {crawler.frontier.size()}")

    # DNS cache statistics
    dns_stats = crawler.dns_resolver.get_stats()
    print(f"\nDNS cache: {dns_stats['hit_count']} hits, {dns_stats['miss_count']} misses, {dns_stats['size']} entries")

    print("\n=========================\n")


def clean_data(days: int) -> None:
    """
    Clean old data

    Args:
        days: Days threshold for data cleaning
    """
    try:
        if crawler is None:
            initialize_crawler()

        # Get MongoDB connection
        storage = crawler.mongo_client

        # Clean old pages
        old_pages = storage.clean_old_pages(days)

        # Clean failed URLs
        failed_urls = storage.clean_failed_urls()

        logger.info(f"Cleaned {old_pages} old pages and {failed_urls} failed URLs")
        print(f"Cleaned {old_pages} old pages and {failed_urls} failed URLs")
    except Exception as e:
        logger.error(f"Error cleaning data: {e}")
        print(f"Error cleaning data: {e}")

def export_data(export_format: str, output_file: str) -> None:
    """
    Export crawler data

    Args:
        export_format: Format to export (json, csv)
        output_file: Output file path
    """
    try:
        if crawler is None:
            initialize_crawler()

        # Read from the collections the Crawler created ('pages', 'urls', 'stats')
        pages = list(crawler.pages_collection.find({}, {'_id': 0}))
        urls = list(crawler.urls_collection.find({}, {'_id': 0}))
        stats = list(crawler.stats_collection.find({}, {'_id': 0}))

        # Prepare export data
        export_data = {
            'metadata': {
                'exported_at': datetime.datetime.now().isoformat(),
                'pages_count': len(pages),
                'urls_count': len(urls),
                'stats_count': len(stats),
            },
            'pages': pages,
            'urls': urls,
            'stats': stats
        }

        # Convert datetime objects to strings
        export_data = json.loads(json.dumps(export_data, default=str))

        # Export based on format
        if export_format.lower() == 'json':
            with open(output_file, 'w') as f:
                json.dump(export_data, f, indent=2)
            logger.info(f"Data exported to {output_file} in JSON format")
            print(f"Data exported to {output_file} in JSON format")
        elif export_format.lower() == 'csv':
            # Split export into multiple CSV files
            base_name = os.path.splitext(output_file)[0]

            # Export pages
            pages_file = f"{base_name}_pages.csv"
            if pages:
                with open(pages_file, 'w', newline='') as f:
                    writer = csv.DictWriter(f, fieldnames=pages[0].keys())
                    writer.writeheader()
                    writer.writerows(pages)

            # Export URLs
            urls_file = f"{base_name}_urls.csv"
            if urls:
                with open(urls_file, 'w', newline='') as f:
                    writer = csv.DictWriter(f, fieldnames=urls[0].keys())
                    writer.writeheader()
                    writer.writerows(urls)

            # Export stats
            stats_file = f"{base_name}_stats.csv"
            if stats:
                with open(stats_file, 'w', newline='') as f:
                    writer = csv.DictWriter(f, fieldnames=stats[0].keys())
                    writer.writeheader()
                    writer.writerows(stats)

            logger.info(f"Data exported to {base_name}_*.csv files in CSV format")
            print(f"Data exported to {base_name}_*.csv files in CSV format")
        else:
            logger.error(f"Unsupported export format: {export_format}")
            print(f"Unsupported export format: {export_format}")
    except Exception as e:
        logger.error(f"Error exporting data: {e}")
        print(f"Error exporting data: {e}")


def set_max_depth(depth: int) -> None:
    """
    Set maximum crawl depth

    Args:
        depth: Maximum crawl depth
    """
    try:
        depth = int(depth)
        if depth < 0:
            logger.error("Depth must be a non-negative integer")
            print("Depth must be a non-negative integer")
            return

        # Update configuration
        config.MAX_DEPTH = depth

        logger.info(f"Maximum crawl depth set to {depth}")
        print(f"Maximum crawl depth set to {depth}")
    except ValueError:
        logger.error("Depth must be a valid integer")
        print("Depth must be a valid integer")


def add_seed_urls(urls: List[str]) -> None:
    """
    Add seed URLs to the crawler

    Args:
        urls: List of URLs to add
    """
    if crawler is None:
        initialize_crawler()

    num_added = crawler.add_seed_urls(urls)
    logger.info(f"Added {num_added} seed URLs")
    print(f"Added {num_added} seed URLs")


def handle_signal(sig, frame):
    """Handle signal interrupts"""
    if sig == signal.SIGINT:
        logger.info("Received SIGINT, stopping crawler")
        stop_crawler()
        sys.exit(0)
    elif sig == signal.SIGTERM:
        logger.info("Received SIGTERM, stopping crawler")
        stop_crawler()
        sys.exit(0)


def main():
    """Main entry point"""
    # Register signal handlers
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # Parse arguments
    args = docopt(__doc__, version='Web Crawler 1.0')

    # Handle commands
    if args['start']:
        workers = int(args['--workers'])
        async_mode = args['--async']
        seed_urls = args['--seed'] if args['--seed'] else []
        start_crawler(workers, async_mode, seed_urls)
    elif args['stop']:
        stop_crawler()
    elif args['pause']:
        pause_crawler()
    elif args['resume']:
        resume_crawler()
    elif args['stats']:
        show_stats()
    elif args['clean']:
        days = int(args['--days'])
        clean_data(days)
    elif args['export']:
        export_format = args['--format']
        output_file = args['--output']
        export_data(export_format, output_file)
    elif args['set-max-depth']:
        depth = args['<depth>']
        set_max_depth(depth)
    elif args['add-seed']:
        urls = args['<url>']
        add_seed_urls(urls)
    else:
        print(__doc__)


if __name__ == '__main__':
    main()
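For reference, the commands documented in the docstring above map directly onto the module-level helpers, so the same actions can be scripted from Python. A minimal sketch, assuming the project directory is on the import path; the seed URL is a placeholder:

    import crawl

    crawl.initialize_crawler()                      # build the global Crawler instance
    crawl.add_seed_urls(['https://example.com'])    # same effect as: crawl.py add-seed https://example.com
    crawl.start_crawler(workers=4, async_mode=False, seed_urls=[])  # same effect as: crawl.py start --workers=4
    # start_crawler() blocks until the crawl is stopped or interrupted.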
crawler.log
ADDED
File without changes
crawler.py
ADDED
@@ -0,0 +1,908 @@
"""
Main crawler class to coordinate the web crawling process
"""

import time
import logging
import os
import asyncio
import threading
from typing import List, Dict, Set, Tuple, Optional, Any, Callable
from concurrent.futures import ThreadPoolExecutor
import signal
import json
from datetime import datetime
from urllib.parse import urlparse
import traceback
from pymongo import MongoClient
from prometheus_client import Counter, Gauge, Histogram, start_http_server, REGISTRY
import redis

from models import URL, Page, URLStatus, Priority
from frontier import URLFrontier
from downloader import HTMLDownloader
from parser import HTMLParser
from robots import RobotsHandler
from dns_resolver import DNSResolver
import config
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())


# Check if we're in deployment mode
IS_DEPLOYMENT = os.getenv('DEPLOYMENT', 'false').lower() == 'true'

# Import local configuration if available
try:
    import local_config
    # Override config settings with local settings
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    print(f"Loaded local configuration from {local_config.__file__}")
except ImportError:
    pass

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class Crawler:
    """
    Main crawler class that coordinates the web crawling process

    Manages:
    - URL Frontier
    - HTML Downloader
    - HTML Parser
    - Content Storage
    - Monitoring and Statistics
    """

    def __init__(self,
                 mongo_uri: Optional[str] = None,
                 redis_uri: Optional[str] = None,
                 metrics_port: int = 9100,
                 storage: Optional[Any] = None):
        """
        Initialize the crawler

        Args:
            mongo_uri: MongoDB URI for content storage
            redis_uri: Redis URI for URL frontier
            metrics_port: Port for Prometheus metrics server
            storage: Optional storage backend for deployment mode
        """
        self.storage = storage
        self.metrics_port = metrics_port

        # Initialize database connections only if not using custom storage
        if storage is None:
            self.mongo_uri = mongo_uri or config.MONGODB_URI
            self.redis_uri = redis_uri or config.REDIS_URI

            # Connect to MongoDB
            self.mongo_client = MongoClient(self.mongo_uri)
            self.db = self.mongo_client[config.MONGODB_DB]
            self.pages_collection = self.db['pages']
            self.urls_collection = self.db['urls']
            self.stats_collection = self.db['stats']

            # Ensure indexes
            self._create_indexes()

            # Create frontier with Redis
            self.frontier = URLFrontier(redis_client=redis.from_url(self.redis_uri))
        else:
            # In deployment mode, use in-memory storage
            self.frontier = URLFrontier(use_memory=True)

        # Create other components that don't need database connections
        self.robots_handler = RobotsHandler()
        self.dns_resolver = DNSResolver()
        self.downloader = HTMLDownloader(self.dns_resolver, self.robots_handler)
        self.parser = HTMLParser()

        # Initialize statistics
        self.stats = {
            'pages_crawled': 0,
            'pages_failed': 0,
            'urls_discovered': 0,
            'urls_filtered': 0,
            'start_time': time.time(),
            'domains_crawled': set(),
            'content_types': {},
            'status_codes': {},
        }

        # Set up metrics only in local mode
        if not IS_DEPLOYMENT:
            self._setup_metrics()
        else:
            # In deployment mode, use dummy metrics that do nothing
            self.pages_crawled_counter = DummyMetric()
            self.pages_failed_counter = DummyMetric()
            self.urls_discovered_counter = DummyMetric()
            self.urls_filtered_counter = DummyMetric()
            self.frontier_size_gauge = DummyMetric()
            self.active_threads_gauge = DummyMetric()
            self.download_time_histogram = DummyMetric()
            self.page_size_histogram = DummyMetric()

        # Flags to control crawling
        self.running = False
        self.paused = False
        self.stop_event = threading.Event()

        # Create storage directories if they don't exist
        os.makedirs(config.HTML_STORAGE_PATH, exist_ok=True)
        os.makedirs(config.LOG_PATH, exist_ok=True)

    def _create_indexes(self):
        """Create indexes for MongoDB collections"""
        try:
            # Pages collection indexes
            self.pages_collection.create_index('url', unique=True)
            self.pages_collection.create_index('content_hash')
            self.pages_collection.create_index('crawled_at')

            # URLs collection indexes
            self.urls_collection.create_index('url', unique=True)
            self.urls_collection.create_index('normalized_url', unique=True)
            self.urls_collection.create_index('domain')
            self.urls_collection.create_index('status')
            self.urls_collection.create_index('priority')

            logger.info("MongoDB indexes created")
        except Exception as e:
            logger.error(f"Error creating MongoDB indexes: {e}")

    def _setup_metrics(self):
        """Set up Prometheus metrics"""
        # Clean up any existing metrics
        collectors_to_remove = []
        for collector in REGISTRY._collector_to_names:
            for name in REGISTRY._collector_to_names[collector]:
                if name.startswith('crawler_'):
                    collectors_to_remove.append(collector)
                    break

        for collector in collectors_to_remove:
            REGISTRY.unregister(collector)

        # Counters
        self.pages_crawled_counter = Counter('crawler_pages_crawled_total', 'Total pages crawled')
        self.pages_failed_counter = Counter('crawler_pages_failed_total', 'Total pages failed')
        self.urls_discovered_counter = Counter('crawler_urls_discovered_total', 'Total URLs discovered')
        self.urls_filtered_counter = Counter('crawler_urls_filtered_total', 'Total URLs filtered')

        # Gauges
        self.frontier_size_gauge = Gauge('crawler_frontier_size', 'Size of URL frontier')
        self.active_threads_gauge = Gauge('crawler_active_threads', 'Number of active crawler threads')

        # Histograms
        self.download_time_histogram = Histogram('crawler_download_time_seconds', 'Time to download pages')
        self.page_size_histogram = Histogram('crawler_page_size_bytes', 'Size of downloaded pages')

        # Start metrics server
        try:
            start_http_server(self.metrics_port)
            logger.info(f"Metrics server started on port {self.metrics_port}")
        except Exception as e:
            logger.error(f"Error starting metrics server: {e}")

    def add_seed_urls(self, urls: List[str], priority: Priority = Priority.VERY_HIGH) -> int:
        """
        Add seed URLs to the frontier

        Args:
            urls: List of URLs to add
            priority: Priority for the seed URLs

        Returns:
            Number of URLs added
        """
        added = 0
        for url in urls:
            url_obj = URL(
                url=url,
                status=URLStatus.PENDING,
                priority=priority,
                depth=0  # Seed URLs are at depth 0
            )

            # Save URL based on storage mode
            try:
                if self.storage is not None:
                    # Use custom storage in deployment mode
                    self.storage.add_url(url_obj)
                else:
                    # Use MongoDB in local mode
                    self.urls_collection.update_one(
                        {'url': url},
                        {'$set': url_obj.dict()},
                        upsert=True
                    )
            except Exception as e:
                logger.error(f"Error saving seed URL to database: {e}")

            # Add to frontier
            if self.frontier.add_url(url_obj):
                added += 1
                self.urls_discovered_counter.inc()
                logger.info(f"Added seed URL: {url}")

        return added

    def start(self, num_workers: int = None, async_mode: bool = False) -> None:
        """
        Start the crawler

        Args:
            num_workers: Number of worker threads
            async_mode: Whether to use async mode
        """
        if self.running:
            logger.warning("Crawler is already running")
            return

        num_workers = num_workers or config.MAX_WORKERS

        # Reset stop event
        self.stop_event.clear()

        # Add seed URLs if frontier is empty
        if self.frontier.size() == 0:
            logger.info("Adding seed URLs")
            self.add_seed_urls(config.SEED_URLS)

        # Start crawler
        self.running = True

        # Register signal handlers
        self._register_signal_handlers()

        logger.info(f"Starting crawler with {num_workers} workers")

        if async_mode:
            # Use asyncio for crawler
            try:
                loop = asyncio.get_event_loop()
                loop.run_until_complete(self._crawl_async(num_workers))
            except KeyboardInterrupt:
                logger.info("Crawler stopped by user")
            except Exception as e:
                logger.error(f"Error in async crawler: {e}")
                logger.error(traceback.format_exc())
            finally:
                self._cleanup()
        else:
            # Use threads for crawler
            with ThreadPoolExecutor(max_workers=num_workers) as executor:
                try:
                    # Submit worker tasks
                    futures = [executor.submit(self._crawl_worker) for _ in range(num_workers)]

                    # Wait for completion
                    for future in futures:
                        future.result()
                except KeyboardInterrupt:
                    logger.info("Crawler stopped by user")
                except Exception as e:
                    logger.error(f"Error in threaded crawler: {e}")
                    logger.error(traceback.format_exc())
                finally:
                    self._cleanup()

    def _register_signal_handlers(self) -> None:
        """Register signal handlers for graceful shutdown"""
        def signal_handler(sig, frame):
            logger.info(f"Received signal {sig}, shutting down")
            self.stop()

        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

    def _crawl_worker(self) -> None:
        """Worker function for threaded crawler"""
        try:
            self.active_threads_gauge.inc()

            while self.running and not self.stop_event.is_set():
                # Check if paused
                if self.paused:
                    time.sleep(1)
                    continue

                # Get next URL from frontier
                url_obj = self.frontier.get_next_url()

                # No URL available, wait and retry
                if url_obj is None:
                    time.sleep(1)
                    continue

                try:
                    # Process the URL
                    self._process_url(url_obj)

                    # Update statistics
                    self._update_stats()

                except Exception as e:
                    logger.error(f"Error processing URL {url_obj.url}: {e}")
                    logger.error(traceback.format_exc())

                    # Update URL status to failed
                    self._mark_url_failed(url_obj, str(e))
        except Exception as e:
            logger.error(f"Unhandled error in worker thread: {e}")
            logger.error(traceback.format_exc())
        finally:
            self.active_threads_gauge.dec()

    async def _crawl_async(self, num_workers: int) -> None:
        """Async worker function for asyncio crawler"""
        try:
            self.active_threads_gauge.inc(num_workers)

            # Create tasks
            tasks = [self._async_worker() for _ in range(num_workers)]

            # Wait for all tasks to complete
            await asyncio.gather(*tasks)

        except Exception as e:
            logger.error(f"Unhandled error in async crawler: {e}")
            logger.error(traceback.format_exc())
        finally:
            self.active_threads_gauge.dec(num_workers)

    async def _async_worker(self) -> None:
        """Async worker function"""
        try:
            while self.running and not self.stop_event.is_set():
                # Check if paused
                if self.paused:
                    await asyncio.sleep(1)
                    continue

                # Get next URL from frontier
                url_obj = self.frontier.get_next_url()

                # No URL available, wait and retry
                if url_obj is None:
                    await asyncio.sleep(1)
                    continue

                try:
                    # Process the URL
                    await self._process_url_async(url_obj)

                    # Update statistics
                    self._update_stats()

                except Exception as e:
                    logger.error(f"Error processing URL {url_obj.url}: {e}")
                    logger.error(traceback.format_exc())

                    # Update URL status to failed
                    self._mark_url_failed(url_obj, str(e))
        except Exception as e:
            logger.error(f"Unhandled error in async worker: {e}")
            logger.error(traceback.format_exc())

    def _process_url(self, url_obj: URL) -> None:
        """
        Process a URL

        Args:
            url_obj: URL object to process
        """
        url = url_obj.url
        logger.debug(f"Processing URL: {url}")

        # Download page
        with self.download_time_histogram.time():
            page = self.downloader.download(url_obj)

        # If download failed
        if page is None:
            self.pages_failed_counter.inc()
            self.stats['pages_failed'] += 1
            self._mark_url_failed(url_obj, url_obj.error or "Download failed")
            return

        # Record page size
        self.page_size_histogram.observe(page.content_length)

        # Check for duplicate content
        content_hash = page.content_hash
        duplicate = self._check_duplicate_content(content_hash, url)

        if duplicate:
            logger.info(f"Duplicate content detected for URL {url}")
            page.is_duplicate = True

            # Mark URL as duplicate but still store the page
            self._mark_url_completed(url_obj)
        else:
            # Parse page and extract URLs
            extracted_urls, metadata = self.parser.parse(page)

            # Store page metadata
            page.metadata = metadata

            # Process extracted URLs
            self._process_extracted_urls(extracted_urls, url_obj, metadata)

            # Mark URL as completed
            self._mark_url_completed(url_obj)

        # Store page
        self._store_page(page)

        # Update statistics
        self.pages_crawled_counter.inc()
        self.stats['pages_crawled'] += 1

        # Add domain to statistics
        domain = url_obj.domain
        self.stats['domains_crawled'].add(domain)

        # Update content type statistics
        content_type = page.content_type.split(';')[0].strip()
        self.stats['content_types'][content_type] = self.stats['content_types'].get(content_type, 0) + 1

        # Update status code statistics
        status_code = page.status_code
        self.stats['status_codes'][str(status_code)] = self.stats['status_codes'].get(str(status_code), 0) + 1

    async def _process_url_async(self, url_obj: URL) -> None:
        """
        Process a URL asynchronously

        Args:
            url_obj: URL object to process
        """
        url = url_obj.url
        logger.debug(f"Processing URL (async): {url}")

        # Download page
        download_start = time.time()
        page = await self.downloader.download_async(url_obj)
        download_time = time.time() - download_start
        self.download_time_histogram.observe(download_time)

        # If download failed
        if page is None:
            self.pages_failed_counter.inc()
            self.stats['pages_failed'] += 1
            self._mark_url_failed(url_obj, url_obj.error or "Download failed")
            return

        # Record page size
        self.page_size_histogram.observe(page.content_length)

        # Check for duplicate content
        content_hash = page.content_hash
        duplicate = self._check_duplicate_content(content_hash, url)

        if duplicate:
            logger.info(f"Duplicate content detected for URL {url}")
            page.is_duplicate = True

            # Mark URL as duplicate but still store the page
            self._mark_url_completed(url_obj)
        else:
            # Parse page and extract URLs
            extracted_urls, metadata = self.parser.parse(page)

            # Store page metadata
            page.metadata = metadata

            # Process extracted URLs
            self._process_extracted_urls(extracted_urls, url_obj, metadata)

            # Mark URL as completed
            self._mark_url_completed(url_obj)

        # Store page
        self._store_page(page)

        # Update statistics
        self.pages_crawled_counter.inc()
        self.stats['pages_crawled'] += 1

    def _check_duplicate_content(self, content_hash: str, url: str) -> bool:
        """
        Check if content has been seen before

        Args:
            content_hash: Hash of the content
            url: URL of the page

        Returns:
            True if content is a duplicate, False otherwise
        """
        try:
            if self.storage is not None:
                # Use custom storage - simplified duplicate check
                for page in self.storage.pages.values():
                    if page.content_hash == content_hash and page.url != url:
                        return True
                return False
            else:
                # Use MongoDB
                return self.pages_collection.find_one({
                    'content_hash': content_hash,
                    'url': {'$ne': url}
                }) is not None
        except Exception as e:
            logger.error(f"Error checking for duplicate content: {e}")
            return False

    def _process_extracted_urls(self, urls: List[str], parent_url_obj: URL, metadata: Dict[str, Any]) -> None:
        """
        Process extracted URLs

        Args:
            urls: List of URLs to process
            parent_url_obj: Parent URL object
            metadata: Metadata from the parent page
        """
        parent_url = parent_url_obj.url
        parent_depth = parent_url_obj.depth

        # Check max depth
        if parent_depth >= config.MAX_DEPTH:
            logger.debug(f"Max depth reached for {parent_url}")
            return

        for url in urls:
            # Calculate priority based on URL and metadata
            priority = self.parser.calculate_priority(url, metadata)

            # Create URL object
            url_obj = URL(
                url=url,
                status=URLStatus.PENDING,
                priority=priority,
                depth=parent_depth + 1,
                parent_url=parent_url
            )

            # Add to frontier
            if self.frontier.add_url(url_obj):
                # URL was added to frontier
                self.urls_discovered_counter.inc()
                self.stats['urls_discovered'] += 1

                # Save URL based on storage mode
                try:
                    if self.storage is not None:
                        # Use custom storage in deployment mode
                        self.storage.add_url(url_obj)
                    else:
                        # Use MongoDB in local mode
                        self.urls_collection.update_one(
                            {'url': url},
                            {'$set': url_obj.dict()},
                            upsert=True
                        )
                except Exception as e:
                    logger.error(f"Error saving URL to database: {e}")
            else:
                # URL was not added (filtered or duplicate)
                self.urls_filtered_counter.inc()
                self.stats['urls_filtered'] += 1

    def _mark_url_completed(self, url_obj: URL) -> None:
        """
        Mark URL as completed

        Args:
            url_obj: URL object to mark as completed
        """
        try:
            url_obj.status = URLStatus.COMPLETED
            url_obj.completed_at = datetime.now()

            if self.storage is not None:
                # Use custom storage
                self.storage.add_url(url_obj)
            else:
                # Use MongoDB
                self.urls_collection.update_one(
                    {'url': url_obj.url},
                    {'$set': url_obj.dict()},
                    upsert=True
                )
        except Exception as e:
            logger.error(f"Error marking URL as completed: {e}")

    def _mark_url_failed(self, url_obj: URL, error: str) -> None:
        """
        Mark URL as failed

        Args:
            url_obj: URL object to mark as failed
            error: Error message
        """
        try:
            url_obj.status = URLStatus.FAILED
            url_obj.error = error
            url_obj.completed_at = datetime.now()

            if self.storage is not None:
                # Use custom storage
                self.storage.add_url(url_obj)
            else:
                # Use MongoDB
                self.urls_collection.update_one(
                    {'url': url_obj.url},
                    {'$set': url_obj.dict()},
                    upsert=True
                )

            # If retries not exceeded, add back to frontier with lower priority
            if url_obj.retries < config.RETRY_TIMES:
                # Lower priority by one level (to a maximum of VERY_LOW)
                new_priority = min(Priority.VERY_LOW, Priority(url_obj.priority + 1))
                url_obj.priority = new_priority
                url_obj.status = URLStatus.PENDING

                # Add back to frontier
                self.frontier.add_url(url_obj)

        except Exception as e:
            logger.error(f"Error marking URL as failed: {e}")

    def _store_page(self, page: Page) -> None:
        """
        Store a page in the database and optionally on disk

        Args:
            page: Page object to store
        """
        try:
            if self.storage is not None:
                # Use custom storage in deployment mode
                self.storage.add_page(page)
            else:
                # Use MongoDB in local mode
                self.pages_collection.update_one(
                    {'url': page.url},
                    {'$set': page.dict()},
                    upsert=True
                )

            # Optionally store HTML content on disk
            if not page.is_duplicate:
                if IS_DEPLOYMENT:
                    # In deployment mode, store in temporary directory
                    domain_dir = os.path.join(config.HTML_STORAGE_PATH, self._extract_domain(page.url))
                    os.makedirs(domain_dir, exist_ok=True)

                    # Create filename from URL
                    filename = self._url_to_filename(page.url)
                    filepath = os.path.join(domain_dir, filename)

                    # Write HTML to file
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(page.content)

                    logger.debug(f"Stored HTML content for {page.url} at {filepath}")
                else:
                    # In local mode, store in permanent storage
                    domain = self._extract_domain(page.url)
                    domain_dir = os.path.join(config.HTML_STORAGE_PATH, domain)
                    os.makedirs(domain_dir, exist_ok=True)

                    # Create filename from URL
                    filename = self._url_to_filename(page.url)
                    filepath = os.path.join(domain_dir, filename)

                    # Write HTML to file
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(page.content)

                    logger.debug(f"Stored HTML content for {page.url} at {filepath}")
        except Exception as e:
            logger.error(f"Error storing page: {e}")

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        parsed = urlparse(url)
        return parsed.netloc.replace(':', '_')

    def _url_to_filename(self, url: str) -> str:
        """Convert URL to filename"""
        # Hash the URL to create a safe filename
        url_hash = self._hash_url(url)
        return f"{url_hash}.html"

    def _hash_url(self, url: str) -> str:
        """Create a hash of a URL"""
        import hashlib
        return hashlib.md5(url.encode('utf-8')).hexdigest()

    def _update_stats(self) -> None:
        """Update and log statistics"""
        # Update frontier size gauge
        self.frontier_size_gauge.set(self.frontier.size())

        # Log statistics periodically
        if self.stats['pages_crawled'] % 100 == 0:
            self._log_stats()

    def _log_stats(self) -> None:
        """Log crawler statistics"""
        # Calculate elapsed time
        elapsed = time.time() - self.stats['start_time']
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)

        # Get current statistics
        pages_crawled = self.stats['pages_crawled']
        pages_failed = self.stats['pages_failed']
        urls_discovered = self.stats['urls_discovered']
        urls_filtered = self.stats['urls_filtered']
        domains_crawled = len(self.stats['domains_crawled'])
        frontier_size = self.frontier.size()

        # Calculate pages per second
        pages_per_second = pages_crawled / elapsed if elapsed > 0 else 0

        # Log statistics
        logger.info(
            f"Crawler running for {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d} - "
            f"Pages: {pages_crawled} ({pages_per_second:.2f}/s) - "
            f"Failed: {pages_failed} - "
            f"URLs Discovered: {urls_discovered} - "
            f"URLs Filtered: {urls_filtered} - "
            f"Domains: {domains_crawled} - "
            f"Frontier: {frontier_size}"
        )

        # Save statistics to database
        try:
            stats_copy = self.stats.copy()
            stats_copy['domains_crawled'] = list(stats_copy['domains_crawled'])
            stats_copy['timestamp'] = datetime.now()

            self.stats_collection.insert_one(stats_copy)
        except Exception as e:
            logger.error(f"Error saving statistics to database: {e}")

    def stop(self) -> None:
        """Stop the crawler"""
        if not self.running:
            logger.warning("Crawler is not running")
            return

        logger.info("Stopping crawler")
        self.stop_event.set()
        self.running = False

    def pause(self) -> None:
        """Pause the crawler"""
        if not self.running:
            logger.warning("Crawler is not running")
            return

        logger.info("Pausing crawler")
        self.paused = True

    def resume(self) -> None:
        """Resume the crawler"""
        if not self.running:
            logger.warning("Crawler is not running")
            return

        logger.info("Resuming crawler")
        self.paused = False

    def checkpoint(self) -> bool:
        """
        Save crawler state for recovery

        Returns:
            True if successful, False otherwise
        """
        logger.info("Creating crawler checkpoint")

        # Checkpoint the frontier
        frontier_checkpoint = self.frontier.checkpoint()

        # Save current statistics
        try:
            stats_copy = self.stats.copy()
            stats_copy['domains_crawled'] = list(stats_copy['domains_crawled'])
            stats_copy['checkpoint_time'] = datetime.now()

            with open(os.path.join(config.STORAGE_PATH, 'crawler_stats.json'), 'w') as f:
                json.dump(stats_copy, f, default=str)

            logger.info("Crawler checkpoint created")
            return frontier_checkpoint
        except Exception as e:
            logger.error(f"Error creating crawler checkpoint: {e}")
            return False

    def restore(self) -> bool:
        """
        Restore crawler state from checkpoint

        Returns:
            True if successful, False otherwise
        """
        logger.info("Restoring crawler from checkpoint")

        # Restore frontier
        frontier_restored = self.frontier.restore()

        # Restore statistics
        try:
            stats_path = os.path.join(config.STORAGE_PATH, 'crawler_stats.json')
            if os.path.exists(stats_path):
                with open(stats_path, 'r') as f:
                    saved_stats = json.load(f)

                # Restore stats
                self.stats = saved_stats
                self.stats['domains_crawled'] = set(self.stats['domains_crawled'])

                logger.info("Crawler statistics restored")
            else:
                logger.warning("No statistics checkpoint found")

            return frontier_restored
        except Exception as e:
            logger.error(f"Error restoring crawler checkpoint: {e}")
            return False

    def _cleanup(self) -> None:
        """Clean up resources when crawler stops"""
        # Create final checkpoint
        self.checkpoint()

        # Log final statistics
        self._log_stats()

        # Reset flags
        self.running = False
        self.paused = False

        logger.info("Crawler stopped")


# Dummy metric class for deployment mode
class DummyMetric:
    """A dummy metric that does nothing"""
    def inc(self, *args, **kwargs): pass
    def dec(self, *args, **kwargs): pass
    def set(self, *args, **kwargs): pass
    def observe(self, *args, **kwargs): pass
    def time(self): return self.Timer()

    class Timer:
        def __enter__(self): pass
        def __exit__(self, exc_type, exc_val, exc_tb): pass


if __name__ == "__main__":
    # Create and start crawler
    crawler = Crawler()

    try:
        crawler.start()
    except KeyboardInterrupt:
        logger.info("Crawler interrupted by user")
    finally:
        crawler.stop()
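For reference, the Crawler class can also be embedded in another script instead of being run via `python crawler.py`. A minimal sketch, assuming MongoDB and Redis are reachable at the URIs configured in config.py; the seed URL is a placeholder:

    from crawler import Crawler

    crawler = Crawler(metrics_port=9100)            # connects to MongoDB/Redis from config
    crawler.add_seed_urls(['https://example.com'])  # enqueue a starting point at depth 0
    try:
        crawler.start(num_workers=4)                # blocks until stopped or interrupted
    finally:
        crawler.stop()
        crawler.checkpoint()                        # persist frontier state and statistics for restore()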
deduplication.py
ADDED
@@ -0,0 +1,422 @@
"""
Content deduplication component for the web crawler.

Provides functionality to detect duplicate pages efficiently using:

1. Exact content hashing
2. Shingling and MinHash for near-duplicate detection
3. SimHash for fuzzy matching
"""

import hashlib
import logging
import time
from typing import Set, List, Dict, Tuple, Optional, Union
import random
import numpy as np
from collections import defaultdict
import re

import config

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class ContentDeduplicator:
    """
    Content deduplication using multiple techniques:
    - Exact match (MD5 hash)
    - Near-duplicate detection (MinHash)
    - Fuzzy matching (SimHash)
    """

    def __init__(self):
        """Initialize the deduplicator"""
        # Exact content hashing
        self.content_hashes = set()
        self.url_hashes = {}  # URL -> exact content hash

        # MinHash parameters
        self.num_hashes = 100
        self.minhash_signatures = {}  # URL -> MinHash signature
        self.minhash_bands = defaultdict(set)  # band_id -> set of URLs
        self.band_size = 5  # Each band contains 5 signatures
        self.shingle_size = 3  # k-shingles of 3 consecutive tokens

        # SimHash parameters
        self.simhash_dim = 64
        self.simhash_values = {}  # URL -> SimHash value
        self.hamming_threshold = 3  # Maximum Hamming distance for similarity

        # Cache of previously computed duplicates for quick lookups
        self.duplicate_cache = {}  # URL -> set of duplicate URLs

        # Token preprocessing
        self.token_pattern = re.compile(r'\w+')
        self.stop_words = set(['the', 'and', 'a', 'to', 'of', 'in', 'is', 'that', 'for', 'on', 'with'])

        # Statistics
        self.stats = {
            'exact_duplicates': 0,
            'near_duplicates': 0,
            'fuzzy_duplicates': 0,
            'processing_time': 0,
            'total_documents': 0,
        }

    def is_duplicate(self, url: str, content: str) -> Tuple[bool, Optional[str]]:
        """
        Check if content is a duplicate

        Args:
            url: URL of the page
            content: Page content

        Returns:
            (is_duplicate, duplicate_url): Tuple indicating if content is duplicate and what it duplicates
        """
        start_time = time.time()

        # Check exact match first (fastest)
        content_hash = self._hash_content(content)
        if content_hash in self.content_hashes:
            self.stats['exact_duplicates'] += 1
            processing_time = time.time() - start_time
            self.stats['processing_time'] += processing_time

            # Find the URL with the same hash
            for existing_url, existing_hash in self._get_hash_map().items():
                if existing_hash == content_hash and existing_url != url:
                    logger.debug(f"Exact duplicate detected: {url} duplicates {existing_url}")
                    return True, existing_url

            return True, None

        # Check cache for quick lookup
        if url in self.duplicate_cache:
            duplicate_url = next(iter(self.duplicate_cache[url]))
            logger.debug(f"Duplicate found in cache: {url} duplicates {duplicate_url}")
            return True, duplicate_url

        # Only perform more expensive checks if configured to do so
        if config.NEAR_DUPLICATE_DETECTION:
            # Check for near-duplicates using MinHash
            near_duplicate = self._check_minhash(url, content)
            if near_duplicate:
                self.stats['near_duplicates'] += 1
                processing_time = time.time() - start_time
                self.stats['processing_time'] += processing_time

                logger.debug(f"Near-duplicate detected: {url} is similar to {near_duplicate}")
                self._add_to_duplicate_cache(url, near_duplicate)
                return True, near_duplicate

        if config.FUZZY_DUPLICATE_DETECTION:
            # Check for fuzzy matches using SimHash
            fuzzy_duplicate = self._check_simhash(url, content)
            if fuzzy_duplicate:
                self.stats['fuzzy_duplicates'] += 1
                processing_time = time.time() - start_time
                self.stats['processing_time'] += processing_time

                logger.debug(f"Fuzzy duplicate detected: {url} is similar to {fuzzy_duplicate}")
                self._add_to_duplicate_cache(url, fuzzy_duplicate)
                return True, fuzzy_duplicate

        # Not a duplicate, add to index
        self._add_to_index(url, content, content_hash)

        self.stats['total_documents'] += 1
        processing_time = time.time() - start_time
        self.stats['processing_time'] += processing_time

        return False, None

    def _add_to_duplicate_cache(self, url: str, duplicate_url: str) -> None:
        """Add URL to duplicate cache for faster lookups"""
        if url not in self.duplicate_cache:
            self.duplicate_cache[url] = set()
        self.duplicate_cache[url].add(duplicate_url)

        # Also add reverse relationship
        if duplicate_url not in self.duplicate_cache:
            self.duplicate_cache[duplicate_url] = set()
        self.duplicate_cache[duplicate_url].add(url)

    def _get_hash_map(self) -> Dict[str, str]:
        """Get mapping of URLs to their exact content hashes"""
        return dict(self.url_hashes)

    def _hash_content(self, content: str) -> str:
        """Create MD5 hash of content"""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def _preprocess_content(self, content: str) -> List[str]:
        """
        Preprocess content for tokenization:
        1. Convert to lowercase
        2. Remove HTML tags
        3. Extract tokens
        4. Remove stop words
        """
        # Remove HTML tags
        content = re.sub(r'<[^>]+>', ' ', content)

        # Tokenize
        tokens = self.token_pattern.findall(content.lower())

        # Remove stop words
        tokens = [token for token in tokens if token not in self.stop_words]

        return tokens

    def _add_to_index(self, url: str, content: str, content_hash: Optional[str] = None) -> None:
        """
        Add content to the deduplication index

        Args:
            url: URL of the page
            content: Page content
            content_hash: Optional pre-computed hash
        """
        # Add exact hash
        if content_hash is None:
            content_hash = self._hash_content(content)
        self.content_hashes.add(content_hash)
        self.url_hashes[url] = content_hash

        # Add MinHash signature
        if config.NEAR_DUPLICATE_DETECTION:
            signature = self._compute_minhash(content)
            self.minhash_signatures[url] = signature

            # Add to LSH bands
            for i in range(0, self.num_hashes, self.band_size):
                band = tuple(signature[i:i+self.band_size])
                band_id = hash(band)
                self.minhash_bands[band_id].add(url)

        # Add SimHash
        if config.FUZZY_DUPLICATE_DETECTION:
            simhash_value = self._compute_simhash(content)
            self.simhash_values[url] = simhash_value

    def _create_shingles(self, tokens: List[str], k: int = 3) -> Set[str]:
        """
        Create k-shingles from tokens

        Args:
            tokens: List of tokens
            k: Size of shingles

        Returns:
            Set of shingles
        """
        return set(' '.join(tokens[i:i+k]) for i in range(len(tokens) - k + 1))

    def _compute_minhash(self, content: str) -> List[int]:
        """
        Compute MinHash signature for content

        Args:
            content: Page content

        Returns:
            MinHash signature (list of hash values)
        """
        tokens = self._preprocess_content(content)
        shingles = self._create_shingles(tokens, self.shingle_size)

        # Generate random hash functions
        max_hash = 2**32 - 1

        # Create signature
        signature = [max_hash] * self.num_hashes

        # For each shingle, compute hashes and keep minimum values
        for shingle in shingles:
            # Use shingle as seed for random hash functions
            shingle_hash = hash(shingle)

            for i in range(self.num_hashes):
                # Simple linear hash function: (a*x + b) mod c
                a = i + 1  # Different 'a' for each hash function
                b = i * i  # Different 'b' for each hash function
                hash_value = (a * shingle_hash + b) % max_hash

                # Keep the minimum hash value
                signature[i] = min(signature[i], hash_value)

        return signature

    def _check_minhash(self, url: str, content: str) -> Optional[str]:
        """
        Check for near-duplicates using MinHash and LSH

        Args:
            url: URL of the page
            content: Page content

        Returns:
            URL of duplicate page if found, None otherwise
        """
        # Compute MinHash signature
        signature = self._compute_minhash(content)

        # Check each band for potential matches
        candidate_urls = set()
        for i in range(0, self.num_hashes, self.band_size):
            band = tuple(signature[i:i+self.band_size])
            band_id = hash(band)

            # Get URLs that share this band
            if band_id in self.minhash_bands:
                candidate_urls.update(self.minhash_bands[band_id])

        # Check Jaccard similarity with candidates
        for candidate_url in candidate_urls:
            if candidate_url == url:
                continue

            candidate_signature = self.minhash_signatures[candidate_url]
            similarity = self._jaccard_similarity(signature, candidate_signature)

            if similarity >= config.SIMILARITY_THRESHOLD:
                return candidate_url

        return None

    def _jaccard_similarity(self, sig1: List[int], sig2: List[int]) -> float:
        """
        Estimate Jaccard similarity from MinHash signatures

        Args:
            sig1: First signature
            sig2: Second signature

        Returns:
            Estimated Jaccard similarity (0-1)
        """
        if len(sig1) != len(sig2):
            raise ValueError("Signatures must have the same length")

        # Count matching hash values
        matches = sum(1 for i in range(len(sig1)) if sig1[i] == sig2[i])

        # Estimate similarity
        return matches / len(sig1)

+
def _compute_simhash(self, content: str) -> int:
|
313 |
+
"""
|
314 |
+
Compute SimHash for content
|
315 |
+
|
316 |
+
Args:
|
317 |
+
content: Page content
|
318 |
+
|
319 |
+
Returns:
|
320 |
+
SimHash value
|
321 |
+
"""
|
322 |
+
tokens = self._preprocess_content(content)
|
323 |
+
|
324 |
+
# Initialize vector
|
325 |
+
v = [0] * self.simhash_dim
|
326 |
+
|
327 |
+
# For each token, compute hash and update vector
|
328 |
+
for token in tokens:
|
329 |
+
# Compute hash of token
|
330 |
+
token_hash = hashlib.md5(token.encode('utf-8')).digest()
|
331 |
+
|
332 |
+
# Convert to binary representation
|
333 |
+
token_bits = ''.join(format(byte, '08b') for byte in token_hash)
|
334 |
+
|
335 |
+
# Use first self.simhash_dim bits
|
336 |
+
token_bits = token_bits[:self.simhash_dim]
|
337 |
+
|
338 |
+
# Update vector
|
339 |
+
for i, bit in enumerate(token_bits):
|
340 |
+
if bit == '1':
|
341 |
+
v[i] += 1
|
342 |
+
else:
|
343 |
+
v[i] -= 1
|
344 |
+
|
345 |
+
# Create fingerprint
|
346 |
+
fingerprint = 0
|
347 |
+
for i, val in enumerate(v):
|
348 |
+
if val > 0:
|
349 |
+
fingerprint |= (1 << i)
|
350 |
+
|
351 |
+
return fingerprint
|
352 |
+
|
353 |
+
def _check_simhash(self, url: str, content: str) -> Optional[str]:
|
354 |
+
"""
|
355 |
+
Check for fuzzy duplicates using SimHash
|
356 |
+
|
357 |
+
Args:
|
358 |
+
url: URL of the page
|
359 |
+
content: Page content
|
360 |
+
|
361 |
+
Returns:
|
362 |
+
URL of duplicate page if found, None otherwise
|
363 |
+
"""
|
364 |
+
# Compute SimHash
|
365 |
+
simhash_value = self._compute_simhash(content)
|
366 |
+
|
367 |
+
# Compare with existing SimHash values
|
368 |
+
for existing_url, existing_simhash in self.simhash_values.items():
|
369 |
+
if existing_url == url:
|
370 |
+
continue
|
371 |
+
|
372 |
+
# Calculate Hamming distance
|
373 |
+
hamming_distance = bin(simhash_value ^ existing_simhash).count('1')
|
374 |
+
|
375 |
+
if hamming_distance <= self.hamming_threshold:
|
376 |
+
return existing_url
|
377 |
+
|
378 |
+
return None
|
379 |
+
|
380 |
+
def clear(self) -> None:
|
381 |
+
"""Clear all indexes and caches"""
|
382 |
+
self.content_hashes.clear()
|
383 |
+
self.minhash_signatures.clear()
|
384 |
+
self.minhash_bands.clear()
|
385 |
+
self.simhash_values.clear()
|
386 |
+
self.duplicate_cache.clear()
|
387 |
+
|
388 |
+
# Reset statistics
|
389 |
+
self.stats = {
|
390 |
+
'exact_duplicates': 0,
|
391 |
+
'near_duplicates': 0,
|
392 |
+
'fuzzy_duplicates': 0,
|
393 |
+
'processing_time': 0,
|
394 |
+
'total_documents': 0,
|
395 |
+
}
|
396 |
+
|
397 |
+
def get_stats(self) -> Dict[str, Union[int, float]]:
|
398 |
+
"""Get deduplication statistics"""
|
399 |
+
stats_copy = self.stats.copy()
|
400 |
+
|
401 |
+
# Calculate average processing time
|
402 |
+
total_docs = self.stats['total_documents']
|
403 |
+
if total_docs > 0:
|
404 |
+
avg_time = self.stats['processing_time'] / total_docs
|
405 |
+
stats_copy['avg_processing_time'] = avg_time
|
406 |
+
else:
|
407 |
+
stats_copy['avg_processing_time'] = 0
|
408 |
+
|
409 |
+
# Calculate total duplicates
|
410 |
+
total_duplicates = (self.stats['exact_duplicates'] +
|
411 |
+
self.stats['near_duplicates'] +
|
412 |
+
self.stats['fuzzy_duplicates'])
|
413 |
+
stats_copy['total_duplicates'] = total_duplicates
|
414 |
+
|
415 |
+
# Calculate duplicate percentage
|
416 |
+
if total_docs > 0:
|
417 |
+
duplicate_percentage = (total_duplicates / total_docs) * 100
|
418 |
+
stats_copy['duplicate_percentage'] = duplicate_percentage
|
419 |
+
else:
|
420 |
+
stats_copy['duplicate_percentage'] = 0
|
421 |
+
|
422 |
+
return stats_copy
|
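The MinHash machinery above can be exercised on its own. Below is a minimal, self-contained sketch of the signature and estimate steps, mirroring the linear hash scheme used in _compute_minhash; the parameter choices (128 hash functions, 3-word shingles) are illustrative assumptions rather than values taken from config.py, and, like the class above, it relies on Python's per-process hash(), so signatures are only comparable within a single run.

# Standalone MinHash sketch (illustrative parameters, not config.py values)
NUM_HASHES = 128
MAX_HASH = 2**32 - 1

def shingle(text, k=3):
    # Build k-word shingles from whitespace tokens
    tokens = text.lower().split()
    return set(' '.join(tokens[i:i + k]) for i in range(len(tokens) - k + 1))

def minhash_signature(text):
    # Keep the minimum of each linear hash (a*x + b) mod MAX_HASH over all shingles
    sig = [MAX_HASH] * NUM_HASHES
    for s in shingle(text):
        h = hash(s)
        for i in range(NUM_HASHES):
            sig[i] = min(sig[i], ((i + 1) * h + i * i) % MAX_HASH)
    return sig

def estimated_jaccard(sig1, sig2):
    # Fraction of matching positions approximates the Jaccard similarity of the shingle sets
    return sum(a == b for a, b in zip(sig1, sig2)) / len(sig1)

doc_a = "the quick brown fox jumps over the lazy dog near the old river bank"
doc_b = "the quick brown fox jumps over the lazy dog near the old river bend"
print(estimated_jaccard(minhash_signature(doc_a), minhash_signature(doc_b)))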
dns_resolver.py
ADDED
@@ -0,0 +1,161 @@
"""
DNS resolver with caching for web crawler
"""

import socket
import logging
import time
from typing import Dict, Optional, Tuple
from urllib.parse import urlparse
from datetime import datetime, timedelta
from cachetools import TTLCache
import threading
import dns
import dns.resolver

import config

# Import local configuration if available
try:
    import local_config
    # Override config settings with local settings
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    logging.info("Loaded local configuration")
except ImportError:
    pass

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class DNSResolver:
    """
    DNS resolver with caching to improve performance

    DNS resolution can be a bottleneck for crawlers due to the synchronous
    nature of many DNS interfaces. This class provides a cached resolver
    to reduce the number of DNS lookups.
    """

    def __init__(self, cache_size: int = 10000, cache_ttl: int = 3600):
        """
        Initialize DNS resolver

        Args:
            cache_size: Maximum number of DNS records to cache
            cache_ttl: Time to live for cache entries in seconds
        """
        self.cache = TTLCache(maxsize=cache_size, ttl=cache_ttl)
        self.lock = threading.RLock()  # Thread-safe operations
        self.resolver = dns.resolver.Resolver()
        self.resolver.timeout = 3.0  # Timeout for DNS requests in seconds
        self.resolver.lifetime = 5.0  # Total timeout for all DNS requests

        # Stats tracking
        self.hit_count = 0
        self.miss_count = 0

    def resolve(self, url: str) -> Optional[str]:
        """
        Resolve a URL to an IP address

        Args:
            url: URL to resolve

        Returns:
            IP address or None if resolution fails
        """
        try:
            parsed = urlparse(url)
            hostname = parsed.netloc.split(':')[0]  # Remove port if present

            # Check cache first
            with self.lock:
                if hostname in self.cache:
                    logger.debug(f"DNS cache hit for {hostname}")
                    self.hit_count += 1
                    return self.cache[hostname]

            # Cache miss - resolve hostname
            ip_address = self._resolve_hostname(hostname)

            # Update cache
            if ip_address:
                with self.lock:
                    self.cache[hostname] = ip_address
                    self.miss_count += 1

            return ip_address

        except Exception as e:
            logger.warning(f"Error resolving DNS for {url}: {e}")
            return None

    def _resolve_hostname(self, hostname: str) -> Optional[str]:
        """
        Resolve hostname to IP address

        Args:
            hostname: Hostname to resolve

        Returns:
            IP address or None if resolution fails
        """
        try:
            # First try using dnspython for more control
            answers = self.resolver.resolve(hostname, 'A')
            if answers:
                # Return first IP address
                return str(answers[0])
        except dns.exception.DNSException as e:
            logger.debug(f"dnspython DNS resolution failed for {hostname}: {e}")

        # Fall back to socket.gethostbyname
        try:
            return socket.gethostbyname(hostname)
        except socket.gaierror as e:
            logger.warning(f"Socket DNS resolution failed for {hostname}: {e}")
            return None

    def bulk_resolve(self, urls: list) -> Dict[str, Optional[str]]:
        """
        Resolve multiple URLs to IP addresses

        Args:
            urls: List of URLs to resolve

        Returns:
            Dictionary mapping URLs to IP addresses
        """
        results = {}
        for url in urls:
            results[url] = self.resolve(url)
        return results

    def clear_cache(self) -> None:
        """Clear the DNS cache"""
        with self.lock:
            self.cache.clear()

    def get_stats(self) -> Dict[str, int]:
        """
        Get statistics about the DNS cache

        Returns:
            Dictionary with cache statistics
        """
        with self.lock:
            return {
                'size': len(self.cache),
                'max_size': self.cache.maxsize,
                'ttl': self.cache.ttl,
                'hit_count': self.hit_count,
                'miss_count': self.miss_count,
                'hit_ratio': self.hit_count / (self.hit_count + self.miss_count) if (self.hit_count + self.miss_count) > 0 else 0
            }
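A hypothetical usage sketch of the resolver above; the URLs and cache settings are placeholders, not values taken from config.py.

from dns_resolver import DNSResolver

resolver = DNSResolver(cache_size=5000, cache_ttl=1800)
ip_first = resolver.resolve("https://example.com/some/page")    # cache miss, performs a lookup
ip_second = resolver.resolve("https://example.com/other/page")  # same host, served from cache
print(ip_first, ip_second)
print(resolver.get_stats())  # includes hit_count, miss_count and hit_ratio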
docker-compose.yml
ADDED
@@ -0,0 +1,79 @@
version: '3'

services:
  mongodb:
    image: mongo:6.0
    container_name: crawler-mongodb
    ports:
      - "27017:27017"
    volumes:
      - mongodb_data:/data/db
    restart: unless-stopped
    environment:
      - MONGO_INITDB_DATABASE=webcrawler
    networks:
      - crawler-network

  redis:
    image: redis:latest
    container_name: crawler-redis
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    restart: unless-stopped
    networks:
      - crawler-network

  web-crawler:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: web-crawler
    volumes:
      - ./:/app
      - crawler_data:/data/storage
    ports:
      - "9100:9100"
    depends_on:
      - mongodb
      - redis
    environment:
      - MONGODB_URI=mongodb://mongodb:27017/
      - REDIS_URI=redis://redis:6379/0
      - LOG_LEVEL=INFO
      - MAX_WORKERS=4
    networks:
      - crawler-network
    command: python crawl.py start --workers=4

  crawler-api:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: crawler-api
    volumes:
      - ./:/app
      - crawler_data:/data/storage
    ports:
      - "8000:8000"
    depends_on:
      - mongodb
      - redis
      - web-crawler
    environment:
      - MONGODB_URI=mongodb://mongodb:27017/
      - REDIS_URI=redis://redis:6379/0
      - LOG_LEVEL=INFO
    networks:
      - crawler-network
    command: python api.py

networks:
  crawler-network:
    driver: bridge

volumes:
  mongodb_data:
  redis_data:
  crawler_data:
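As a rough connectivity check for the services defined above, something like the following could be run inside either crawler container. It assumes pymongo and redis-py are installed (the frontier already imports redis, and MongoDB access elsewhere in the repo implies a MongoDB driver) and reuses the environment variable names from this compose file.

import os
import redis
from pymongo import MongoClient

# Defaults mirror the compose service names; they are overridden by the environment block above
mongo = MongoClient(os.environ.get("MONGODB_URI", "mongodb://mongodb:27017/"))
cache = redis.from_url(os.environ.get("REDIS_URI", "redis://redis:6379/0"))

print(mongo.admin.command("ping"))  # {'ok': 1.0} when MongoDB is reachable
print(cache.ping())                 # True when Redis is reachable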
downloader.py
ADDED
@@ -0,0 +1,400 @@
"""
HTML Downloader component for web crawler
"""

import time
import logging
import requests
from requests.exceptions import RequestException
from typing import Dict, Optional, Tuple, List, Any
from urllib.parse import urlparse
import aiohttp
import asyncio
from aiohttp.client_exceptions import ClientError
import hashlib
import os

from models import URL, Page, calculate_content_hash
from dns_resolver import DNSResolver
from robots import RobotsHandler
import config

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class HTMLDownloader:
    """
    HTML Downloader responsible for downloading web pages

    Features:
    - Respects robots.txt rules
    - Uses DNS caching for performance
    - Handles errors and retries
    - Supports both synchronous and asynchronous downloads
    """

    def __init__(self,
                 dns_resolver: Optional[DNSResolver] = None,
                 robots_handler: Optional[RobotsHandler] = None,
                 user_agent: Optional[str] = None):
        """
        Initialize HTML Downloader

        Args:
            dns_resolver: DNS resolver for hostname resolution
            robots_handler: Handler for robots.txt
            user_agent: User agent to use for requests
        """
        self.dns_resolver = dns_resolver or DNSResolver()
        self.robots_handler = robots_handler or RobotsHandler()
        self.user_agent = user_agent or config.USER_AGENT

        # Create request session
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        })

    def download(self, url_obj: URL) -> Optional[Page]:
        """
        Download an HTML page from a URL

        Args:
            url_obj: URL object to download

        Returns:
            Page object or None if download fails
        """
        url = url_obj.url
        try:
            # Check robots.txt first
            if config.ROBOTSTXT_OBEY:
                allowed, crawl_delay = self.robots_handler.can_fetch(url)
                if not allowed:
                    logger.info(f"URL not allowed by robots.txt: {url}")
                    url_obj.status = "robotstxt_excluded"
                    return None

                # Respect crawl delay if specified
                if crawl_delay and crawl_delay > 0:
                    time.sleep(crawl_delay)

            # Resolve DNS
            ip_address = self.dns_resolver.resolve(url)
            if not ip_address:
                logger.warning(f"Failed to resolve DNS for URL: {url}")
                url_obj.error = "DNS resolution failed"
                return None

            # Download page with specific headers
            start_time = time.time()
            response = self.session.get(
                url,
                timeout=config.CRAWL_TIMEOUT,
                allow_redirects=True,
                stream=True,  # Stream to avoid downloading large files fully
                headers={
                    'User-Agent': self.user_agent,
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Accept-Encoding': 'gzip',  # Only accept gzip to avoid encoding issues
                    'Connection': 'keep-alive'
                }
            )

            # Log response details
            logger.debug(f"Response status code: {response.status_code}")
            logger.debug(f"Response headers: {dict(response.headers)}")

            # Check content type
            content_type = response.headers.get('Content-Type', '').lower()
            logger.debug(f"Content type for {url}: {content_type}")

            is_html = any(allowed_type in content_type for allowed_type in config.ALLOWED_CONTENT_TYPES) or \
                      any(allowed_type == '*/*' for allowed_type in config.ALLOWED_CONTENT_TYPES)

            if not is_html:
                logger.info(f"Skipping non-HTML content ({content_type}): {url}")
                url_obj.error = f"Non-HTML content type: {content_type}"
                return None

            # Read content (with size limit)
            content = b""
            for chunk in response.iter_content(chunk_size=1024*1024):  # 1MB chunks
                content += chunk
                if len(content) > config.MAX_CONTENT_SIZE:
                    logger.info(f"Content exceeded max size during download: {url}")
                    url_obj.error = f"Content exceeded max size: {len(content)} bytes"
                    return None

            # Log content details
            logger.debug(f"Downloaded content size: {len(content)} bytes")
            logger.debug(f"First 100 bytes (hex): {content[:100].hex()}")

            # Check for UTF-8 BOM
            if content.startswith(b'\xef\xbb\xbf'):
                content = content[3:]
                logger.debug("Removed UTF-8 BOM from content")

            # Try to detect encoding from response headers
            encoding = None
            if 'charset=' in content_type:
                encoding = content_type.split('charset=')[-1].strip()
                logger.debug(f"Found encoding in Content-Type header: {encoding}")

            # Try to detect encoding from content
            try:
                import chardet
                detected = chardet.detect(content)
                if detected['confidence'] > 0.8:  # Only use if confidence is high
                    encoding = detected['encoding']
                    logger.debug(f"Detected encoding using chardet: {encoding} (confidence: {detected['confidence']})")
            except ImportError:
                logger.debug("chardet not available for encoding detection")

            # Decode content with fallbacks
            html_content = None
            encodings_to_try = [
                encoding,
                'utf-8',
                'utf-8-sig',
                'iso-8859-1',
                'cp1252',
                'ascii'
            ]

            for enc in encodings_to_try:
                if not enc:
                    continue
                try:
                    html_content = content.decode(enc)
                    # Quick validation of HTML content
                    if '<!DOCTYPE' in html_content[:1000] or '<html' in html_content[:1000]:
                        logger.debug(f"Successfully decoded content using {enc} encoding")
                        break
                    else:
                        logger.debug(f"Decoded with {enc} but content doesn't look like HTML")
                        html_content = None
                except UnicodeDecodeError:
                    logger.debug(f"Failed to decode content using {enc} encoding")
                    continue

            if html_content is None:
                logger.warning(f"Failed to decode content for URL: {url} with any encoding")
                url_obj.error = "Failed to decode content"
                return None

            # Additional HTML validation
            if not any(marker in html_content[:1000] for marker in ['<!DOCTYPE', '<html', '<head', '<body']):
                logger.warning(f"Content doesn't appear to be valid HTML for URL: {url}")
                url_obj.error = "Invalid HTML content"
                return None

            # Calculate hash for duplicate detection
            content_hash = calculate_content_hash(html_content)

            elapsed_time = time.time() - start_time

            # Create page object
            page = Page(
                url=url,
                status_code=response.status_code,
                content=html_content,
                content_type=content_type,
                content_length=len(content),
                content_hash=content_hash,
                headers={k.lower(): v for k, v in response.headers.items()},
                crawled_at=time.time(),
                redirect_url=response.url if response.url != url else None,
                elapsed_time=elapsed_time
            )

            logger.info(f"Downloaded {len(content)} bytes from {url} in {elapsed_time:.2f}s")
            return page

        except RequestException as e:
            logger.warning(f"Request error for URL {url}: {e}")
            url_obj.error = f"Request error: {str(e)}"
            return None

        except Exception as e:
            logger.error(f"Unexpected error downloading URL {url}: {e}")
            url_obj.error = f"Unexpected error: {str(e)}"
            return None

    async def download_async(self, url_obj: URL, session: Optional[aiohttp.ClientSession] = None) -> Optional[Page]:
        """
        Download an HTML page asynchronously

        Args:
            url_obj: URL object to download
            session: Optional aiohttp session to use

        Returns:
            Page object or None if download fails
        """
        url = url_obj.url
        own_session = False

        try:
            # Check robots.txt first (blocking call)
            if config.ROBOTSTXT_OBEY:
                allowed, crawl_delay = self.robots_handler.can_fetch(url)
                if not allowed:
                    logger.info(f"URL not allowed by robots.txt: {url}")
                    url_obj.status = "robotstxt_excluded"
                    return None

                # Respect crawl delay if specified
                if crawl_delay and crawl_delay > 0:
                    await asyncio.sleep(crawl_delay)

            # Resolve DNS (blocking call, but cached)
            ip_address = self.dns_resolver.resolve(url)
            if not ip_address:
                logger.warning(f"Failed to resolve DNS for URL: {url}")
                url_obj.error = "DNS resolution failed"
                return None

            # Create session if not provided
            if session is None:
                own_session = True
                session = aiohttp.ClientSession(headers={
                    'User-Agent': self.user_agent,
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Cache-Control': 'max-age=0'
                })

            # Download page
            start_time = time.time()
            async with session.get(url, timeout=config.CRAWL_TIMEOUT, allow_redirects=True) as response:
                # Check content type
                content_type = response.headers.get('Content-Type', '').lower()
                is_html = any(allowed_type in content_type for allowed_type in config.ALLOWED_CONTENT_TYPES)

                if not is_html:
                    logger.info(f"Skipping non-HTML content ({content_type}): {url}")
                    url_obj.error = f"Non-HTML content type: {content_type}"
                    return None

                # Check content length
                content_length = int(response.headers.get('Content-Length', 0))
                if content_length > config.MAX_CONTENT_SIZE:
                    logger.info(f"Skipping large content ({content_length} bytes): {url}")
                    url_obj.error = f"Content too large: {content_length} bytes"
                    return None

                # Read content (with size limit)
                content = b""
                async for chunk in response.content.iter_chunked(1024*1024):  # 1MB chunks
                    content += chunk
                    if len(content) > config.MAX_CONTENT_SIZE:
                        logger.info(f"Content exceeded max size during download: {url}")
                        url_obj.error = f"Content exceeded max size: {len(content)} bytes"
                        return None

                # Decode content
                try:
                    html_content = content.decode('utf-8')
                except UnicodeDecodeError:
                    try:
                        # Try with a more forgiving encoding
                        html_content = content.decode('iso-8859-1')
                    except UnicodeDecodeError:
                        logger.warning(f"Failed to decode content for URL: {url}")
                        url_obj.error = "Failed to decode content"
                        return None

                # Calculate hash for duplicate detection
                content_hash = calculate_content_hash(html_content)

                elapsed_time = time.time() - start_time

                # Create page object
                page = Page(
                    url=url,
                    status_code=response.status,
                    content=html_content,
                    content_type=content_type,
                    content_length=len(content),
                    content_hash=content_hash,
                    headers={k.lower(): v for k, v in response.headers.items()},
                    crawled_at=time.time(),
                    redirect_url=str(response.url) if str(response.url) != url else None,
                    elapsed_time=elapsed_time
                )

                logger.info(f"Downloaded {len(content)} bytes from {url} in {elapsed_time:.2f}s")
                return page

        except (ClientError, asyncio.TimeoutError) as e:
            logger.warning(f"Request error for URL {url}: {e}")
            url_obj.error = f"Request error: {str(e)}"
            return None

        except Exception as e:
            logger.error(f"Unexpected error downloading URL {url}: {e}")
            url_obj.error = f"Unexpected error: {str(e)}"
            return None

        finally:
            # Close session if we created it
            if own_session and session:
                await session.close()

    async def bulk_download(self, urls: List[URL], concurrency: int = 10) -> Dict[str, Optional[Page]]:
        """
        Download multiple URLs concurrently

        Args:
            urls: List of URL objects to download
            concurrency: Maximum number of concurrent downloads

        Returns:
            Dictionary mapping URL strings to Page objects
        """
        results = {}

        # Create a session to be shared across requests
        async with aiohttp.ClientSession(headers={
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }) as session:
            # Create a semaphore to limit concurrency
            semaphore = asyncio.Semaphore(concurrency)

            async def download_with_semaphore(url_obj):
                async with semaphore:
                    return await self.download_async(url_obj, session)

            # Create download tasks
            tasks = [download_with_semaphore(url_obj) for url_obj in urls]

            # Wait for all tasks to complete
            pages = await asyncio.gather(*tasks)

            # Map results
            for url_obj, page in zip(urls, pages):
                results[url_obj.url] = page

        return results
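A hypothetical driver for the asynchronous path above; the seed URLs and concurrency value are placeholders.

import asyncio

from downloader import HTMLDownloader
from models import URL

async def main():
    downloader = HTMLDownloader()
    urls = [URL(url=u) for u in (
        "https://example.com/",
        "https://example.org/",
    )]
    results = await downloader.bulk_download(urls, concurrency=5)
    for url, page in results.items():
        # page is None when the download was skipped or failed
        print(url, page.status_code if page else "failed")

if __name__ == "__main__":
    asyncio.run(main())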
example.py
ADDED
@@ -0,0 +1,250 @@
#!/usr/bin/env python3
"""
Example script that demonstrates how to use the web crawler programmatically.

This example:
1. Initializes the crawler
2. Adds seed URLs
3. Starts the crawler with 2 workers
4. Monitors progress for a specific duration
5. Pauses, resumes, and stops the crawler
6. Exports crawl data

Usage:
    python example.py [--time=<seconds>] [--workers=<num>] [--async]

Options:
    --time=<seconds>   Duration of the crawl in seconds [default: 60]
    --workers=<num>    Number of worker threads [default: 2]
    --async            Use asynchronous mode
"""

import time
import logging
import sys
import json
import os
import signal
import threading
from docopt import docopt

from crawler import Crawler
from models import URLStatus, Priority
import config

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('example')


def log_stats(crawler, interval=5):
    """Log crawler statistics periodically"""
    stats = crawler.stats
    elapsed = time.time() - stats['start_time']

    logger.info(f"=== Crawler Statistics (after {int(elapsed)}s) ===")
    logger.info(f"Pages crawled: {stats['pages_crawled']}")
    logger.info(f"Pages failed: {stats['pages_failed']}")
    logger.info(f"URLs discovered: {stats['urls_discovered']}")
    logger.info(f"URLs filtered: {stats['urls_filtered']}")
    logger.info(f"Domains crawled: {len(stats['domains_crawled'])}")
    logger.info(f"Frontier size: {crawler.frontier.size()}")

    # Status code distribution
    status_codes = stats['status_codes']
    if status_codes:
        logger.info("Status code distribution:")
        for status, count in sorted(status_codes.items()):
            logger.info(f"  {status}: {count}")

    # Check if crawler is still running
    if crawler.running and not crawler.stop_event.is_set():
        # Schedule next logging
        timer = threading.Timer(interval, log_stats, args=[crawler, interval])
        timer.daemon = True
        timer.start()


def example_crawl(duration=60, workers=2, async_mode=False):
    """
    Example crawler use

    Args:
        duration: Duration of the crawl in seconds
        workers: Number of worker threads
        async_mode: Whether to use async mode
    """
    logger.info("Initializing web crawler...")

    # Initialize crawler
    crawler = Crawler()

    # Add seed URLs
    seed_urls = [
        'https://en.wikipedia.org/wiki/Web_crawler',
        'https://en.wikipedia.org/wiki/Search_engine',
        'https://en.wikipedia.org/wiki/Web_indexing',
        'https://python.org',
        'https://www.example.com'
    ]
    logger.info(f"Adding {len(seed_urls)} seed URLs...")
    crawler.add_seed_urls(seed_urls)

    # Set up signal handling
    def signal_handler(sig, frame):
        logger.info("Received interrupt signal, stopping crawler")
        crawler.stop()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # Start a thread to log stats periodically
    log_stats(crawler, interval=5)

    # Start the crawler in a separate thread
    logger.info(f"Starting crawler with {workers} workers (async={async_mode})...")
    crawler_thread = threading.Thread(
        target=crawler.start,
        kwargs={'num_workers': workers, 'async_mode': async_mode}
    )
    crawler_thread.daemon = True
    crawler_thread.start()

    # Let the crawler run for a while
    logger.info(f"Crawler will run for {duration} seconds...")
    time.sleep(duration // 2)

    # Pause crawler
    logger.info("Pausing crawler for 5 seconds...")
    crawler.pause()
    time.sleep(5)

    # Resume crawler
    logger.info("Resuming crawler...")
    crawler.resume()
    time.sleep(duration // 2)

    # Stop crawler
    logger.info("Stopping crawler...")
    crawler.stop()

    # Wait for crawler to stop
    crawler_thread.join(timeout=10)

    # Export crawl data
    export_dir = os.path.join(config.STORAGE_PATH, 'exports')
    os.makedirs(export_dir, exist_ok=True)
    export_file = os.path.join(export_dir, 'example_crawl_results.json')

    logger.info(f"Exporting crawl data to {export_file}...")
    export_results(crawler, export_file)

    logger.info("Crawl example completed")

    # Print summary
    print_summary(crawler)


def export_results(crawler, output_file):
    """
    Export crawler results to a file

    Args:
        crawler: Crawler instance
        output_file: Output file path
    """
    try:
        # Get MongoDB collections
        pages_collection = crawler.db.pages_collection
        urls_collection = crawler.db.urls_collection

        # Get data
        pages = list(pages_collection.find({}, {'_id': 0}).limit(1000))
        urls = list(urls_collection.find({}, {'_id': 0}).limit(1000))

        # Prepare export data
        export_data = {
            'metadata': {
                'crawl_duration': time.time() - crawler.stats['start_time'],
                'pages_crawled': crawler.stats['pages_crawled'],
                'urls_discovered': crawler.stats['urls_discovered'],
                'domains_crawled': list(crawler.stats['domains_crawled']),
                'exported_pages': len(pages),
                'exported_urls': len(urls),
                'export_timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            },
            'pages': pages,
            'urls': urls,
            'stats': crawler.stats
        }

        # Convert datetime objects to strings for JSON serialization
        export_data = json.loads(json.dumps(export_data, default=str))

        # Write to file
        with open(output_file, 'w') as f:
            json.dump(export_data, f, indent=2)

        logger.info(f"Exported data to {output_file}")
    except Exception as e:
        logger.error(f"Error exporting results: {e}")


def print_summary(crawler):
    """
    Print a summary of the crawl

    Args:
        crawler: Crawler instance
    """
    stats = crawler.stats

    print("\n=============== CRAWL SUMMARY ===============")
    print(f"Duration: {time.time() - stats['start_time']:.2f} seconds")
    print(f"Pages crawled: {stats['pages_crawled']}")
    print(f"Pages failed: {stats['pages_failed']}")
    print(f"URLs discovered: {stats['urls_discovered']}")
    print(f"URLs filtered: {stats['urls_filtered']}")
    print(f"Domains crawled: {len(stats['domains_crawled'])}")

    if stats['domains_crawled']:
        print("\nTop domains:")
        domain_counts = {}
        # Count pages per domain
        for page in crawler.db.pages_collection.find({}, {'domain': 1}):
            domain = page.get('domain', 'unknown')
            domain_counts[domain] = domain_counts.get(domain, 0) + 1

        # Display top domains
        for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {domain}: {count} pages")

    print("\nHTTP Status Codes:")
    for status, count in sorted(stats['status_codes'].items()):
        print(f"  {status}: {count}")

    print("\nContent Types:")
    for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"  {content_type}: {count}")

    print("=============================================\n")


if __name__ == '__main__':
    # Parse command-line arguments
    args = docopt(__doc__)

    duration = int(args['--time'])
    workers = int(args['--workers'])
    async_mode = args['--async']

    try:
        example_crawl(duration, workers, async_mode)
    except KeyboardInterrupt:
        logger.info("Example interrupted by user")
    except Exception as e:
        logger.error(f"Error in example: {e}")
        logger.exception(e)
file_cleanup.py
ADDED
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""
Script to remove all simple_crawler related files without interactive confirmation
"""

import os
import sys
import logging
import shutil

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
)
logger = logging.getLogger("file_cleanup")

def cleanup_files(dry_run=False):
    """List and remove files related to simple_crawler"""
    try:
        crawler_dir = os.path.dirname(os.path.abspath(__file__))

        # Files directly related to simple_crawler
        simple_crawler_files = [
            os.path.join(crawler_dir, "simple_crawler.py"),
            os.path.join(crawler_dir, "README_SIMPLE.md"),
            os.path.join(crawler_dir, "simple_crawler.log"),
            os.path.join(crawler_dir, "local_config.py")
        ]

        # Check storage directories
        storage_dir = os.path.join(crawler_dir, "storage")
        if os.path.exists(storage_dir):
            logger.info(f"Adding storage directory to removal list: {storage_dir}")
            simple_crawler_files.append(storage_dir)

        # Check for any log files with 'crawler' in the name
        for filename in os.listdir(crawler_dir):
            if ('crawler' in filename.lower() or 'crawl' in filename.lower()) and filename.endswith('.log'):
                full_path = os.path.join(crawler_dir, filename)
                if full_path not in simple_crawler_files:
                    logger.info(f"Adding log file to removal list: {filename}")
                    simple_crawler_files.append(full_path)

        # List files that will be removed
        logger.info("The following files will be removed:")
        files_to_remove = []

        for file_path in simple_crawler_files:
            if os.path.exists(file_path):
                logger.info(f"  - {file_path}")
                files_to_remove.append(file_path)
            else:
                logger.info(f"  - {file_path} (not found)")

        if dry_run:
            logger.info("Dry run mode - no files will be removed")
            return True

        # Remove files and directories
        for file_path in files_to_remove:
            if os.path.isdir(file_path):
                logger.info(f"Removing directory: {file_path}")
                shutil.rmtree(file_path)
            else:
                logger.info(f"Removing file: {file_path}")
                os.remove(file_path)

        logger.info("File cleanup completed")
        return True

    except Exception as e:
        logger.error(f"Error cleaning up files: {e}")
        return False

if __name__ == "__main__":
    print("Simple Crawler File Cleanup")
    print("--------------------------")
    print("This script will remove all files related to simple_crawler")
    print()

    # Check for dry-run flag
    dry_run = '--dry-run' in sys.argv

    if '--force' in sys.argv:
        # Non-interactive mode for scripting
        success = cleanup_files(dry_run)
        sys.exit(0 if success else 1)
    else:
        # Interactive mode
        if dry_run:
            print("DRY RUN MODE: Files will be listed but not removed")

        proceed = input("Do you want to proceed with file cleanup? (y/n): ")
        if proceed.lower() != 'y':
            print("Cleanup cancelled")
            sys.exit(0)

        success = cleanup_files(dry_run)
        print(f"\nFile cleanup: {'Completed' if success else 'Failed'}")
frontier.py
ADDED
@@ -0,0 +1,319 @@
"""
URL Frontier implementation for web crawler

The URL Frontier maintains URLs to be crawled with two main goals:
1. Prioritization - Important URLs are crawled first
2. Politeness - Avoid overloading web servers with too many requests
"""

import time
import logging
import heapq
import pickle
import threading
import random
from typing import Dict, List, Tuple, Optional, Any, Set
from collections import deque
import redis
from redis.exceptions import RedisError
import mmh3
import os
import json

from models import URL, Priority, URLStatus
import config

# Import local configuration if available
try:
    import local_config
    # Override config settings with local settings
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    logging.info("Loaded local configuration")
except ImportError:
    pass

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class URLFrontier:
    """
    URL Frontier implementation with prioritization and politeness

    Architecture:
    - Front queues: Priority-based queues
    - Back queues: Host-based queues for politeness

    This uses Redis for persistent storage to handle large number of URLs
    and enable distributed crawling. In deployment mode, it can also use
    in-memory storage.
    """

    def __init__(self, redis_client: Optional[redis.Redis] = None, use_memory: bool = False):
        """Initialize the URL Frontier"""
        self.use_memory = use_memory
        if use_memory:
            # Initialize in-memory storage
            self.memory_storage = {
                'seen_urls': set(),
                'priority_queues': [[] for _ in range(config.PRIORITY_QUEUE_NUM)],
                'host_queues': [[] for _ in range(config.HOST_QUEUE_NUM)]
            }
        else:
            # Use Redis
            self.redis = redis_client or redis.from_url(config.REDIS_URI)

        self.priority_count = config.PRIORITY_QUEUE_NUM  # Number of priority queues
        self.host_count = config.HOST_QUEUE_NUM  # Number of host queues
        self.url_seen_key = "webcrawler:url_seen"  # Bloom filter for seen URLs
        self.priority_queue_key_prefix = "webcrawler:priority_queue:"
        self.host_queue_key_prefix = "webcrawler:host_queue:"
        self.lock = threading.RLock()  # Thread-safe operations

        # Simple mode uses Redis Set instead of Bloom filter
        self.use_simple_mode = getattr(config, 'USE_SIMPLE_URL_SEEN', False)
        logger.info(f"URLFrontier using simple mode: {self.use_simple_mode}")

        # Ensure directory for checkpoint exists
        if not os.path.exists(config.STORAGE_PATH):
            os.makedirs(config.STORAGE_PATH)

        # Initialize URL seen storage
        if not self.use_memory:
            self._init_url_seen()

    def _init_url_seen(self):
        """Initialize URL seen storage based on configuration"""
        try:
            # If using simple mode, just use a Redis set
            if self.use_simple_mode:
                if not self.redis.exists(self.url_seen_key):
                    logger.info("Initializing URL seen set")
                    self.redis.sadd(self.url_seen_key, "initialized")
                return

            # Try to use Bloom filter
            if not self.redis.exists(self.url_seen_key):
                logger.info("Initializing URL seen bloom filter")
                try:
                    # Use a bloom filter with 100 million items and 0.01 false positive rate
                    # This requires approximately 119.5 MB of memory
                    self.redis.execute_command("BF.RESERVE", self.url_seen_key, 0.01, 100000000)
                except RedisError as e:
                    logger.error(f"Failed to initialize bloom filter: {e}")
                    logger.info("Falling back to simple set for URL seen detection")
                    self.use_simple_mode = True
                    # Initialize a set instead
                    if not self.redis.exists(self.url_seen_key):
                        self.redis.sadd(self.url_seen_key, "initialized")
        except RedisError as e:
            logger.error(f"Error initializing URL seen: {e}")
            # Fallback to set if bloom filter is not available
            self.use_simple_mode = True
            if not self.redis.exists(self.url_seen_key):
                self.redis.sadd(self.url_seen_key, "initialized")

    def add_url(self, url_obj: URL) -> bool:
        """Add a URL to the frontier"""
        with self.lock:
            url = url_obj.url

            # Check if URL has been seen
            if self.use_memory:
                if url in self.memory_storage['seen_urls']:
                    return False
                self.memory_storage['seen_urls'].add(url)
            else:
                if self.use_simple_mode:
                    if self.redis.sismember(self.url_seen_key, url):
                        return False
                    self.redis.sadd(self.url_seen_key, url)
                else:
                    if self._check_url_seen(url):
                        return False
                    self._mark_url_seen(url)

            # Add to priority queue
            priority_index = url_obj.priority.value % self.priority_count
            if self.use_memory:
                self.memory_storage['priority_queues'][priority_index].append(url_obj)
            else:
                priority_key = f"{self.priority_queue_key_prefix}{priority_index}"
                self.redis.rpush(priority_key, url_obj.json())

            return True

    def get_next_url(self) -> Optional[URL]:
        """Get the next URL to crawl"""
        with self.lock:
            # Try each priority queue
            for i in range(self.priority_count):
                if self.use_memory:
                    queue = self.memory_storage['priority_queues'][i]
                    if queue:
                        return queue.pop(0)
                else:
                    priority_key = f"{self.priority_queue_key_prefix}{i}"
                    url_data = self.redis.lpop(priority_key)
                    if url_data:
                        return URL.parse_raw(url_data)
            return None

    def _check_url_seen(self, url: str) -> bool:
        """Check if URL has been seen"""
        if self.use_memory:
            return url in self.memory_storage['seen_urls']
        elif self.use_simple_mode:
            return self.redis.sismember(self.url_seen_key, url)
        else:
            # Using Redis Bloom filter
            return bool(self.redis.getbit(self.url_seen_key, self._hash_url(url)))

    def _mark_url_seen(self, url: str) -> None:
        """Mark URL as seen"""
        if self.use_memory:
            self.memory_storage['seen_urls'].add(url)
        elif self.use_simple_mode:
            self.redis.sadd(self.url_seen_key, url)
        else:
            # Using Redis Bloom filter
            self.redis.setbit(self.url_seen_key, self._hash_url(url), 1)

    def _hash_url(self, url: str) -> int:
        """Hash URL for Bloom filter"""
        return hash(url) % (1 << 32)  # 32-bit hash

    def size(self) -> int:
        """Get the total size of all queues"""
        if self.use_memory:
            return sum(len(q) for q in self.memory_storage['priority_queues'])
        else:
            total = 0
            for i in range(self.priority_count):
                priority_key = f"{self.priority_queue_key_prefix}{i}"
                total += self.redis.llen(priority_key)
            return total

    def get_stats(self) -> Dict[str, Any]:
        """Get frontier statistics"""
        stats = {
            "size": self.size(),
            "priority_queues": {},
            "host_queues": {},
        }

        try:
            # Get priority queue stats
            for priority in range(1, self.priority_count + 1):
                queue_key = f"{self.priority_queue_key_prefix}{priority}"
                stats["priority_queues"][f"priority_{priority}"] = self.redis.llen(queue_key)

            # Get host queue stats (just count total host queues with items)
            host_queue_count = 0
            for host_id in range(self.host_count):
                queue_key = f"{self.host_queue_key_prefix}{host_id}"
                if self.redis.llen(queue_key) > 0:
                    host_queue_count += 1

            stats["host_queues"]["count_with_items"] = host_queue_count

            # Add URLs seen count if using simple mode
            if self.use_simple_mode:
                stats["urls_seen"] = self.redis.scard(self.url_seen_key)

            return stats
        except RedisError as e:
            logger.error(f"Error getting frontier stats: {e}")
            return stats

    def checkpoint(self) -> bool:
        """Save frontier state"""
        if self.use_memory:
            # No need to checkpoint in-memory storage
            return True

        try:
            # Save priority queues
            for i in range(self.priority_count):
                priority_key = f"{self.priority_queue_key_prefix}{i}"
                queue_data = []
                while True:
                    url_data = self.redis.lpop(priority_key)
                    if not url_data:
                        break
                    queue_data.append(url_data)

                # Save to file
                checkpoint_file = os.path.join(config.STORAGE_PATH, f"priority_queue_{i}.json")
                with open(checkpoint_file, 'w') as f:
                    json.dump(queue_data, f)

                # Restore queue
                for url_data in reversed(queue_data):
                    self.redis.rpush(priority_key, url_data)

            return True
        except Exception as e:
            logger.error(f"Error creating frontier checkpoint: {e}")
            return False

    def restore(self) -> bool:
        """Restore frontier state"""
        if self.use_memory:
            # No need to restore in-memory storage
            return True

        try:
            # Restore priority queues
            for i in range(self.priority_count):
                checkpoint_file = os.path.join(config.STORAGE_PATH, f"priority_queue_{i}.json")
                if os.path.exists(checkpoint_file):
                    with open(checkpoint_file, 'r') as f:
                        queue_data = json.load(f)

                    # Clear existing queue
                    priority_key = f"{self.priority_queue_key_prefix}{i}"
                    self.redis.delete(priority_key)

                    # Restore queue
                    for url_data in queue_data:
                        self.redis.rpush(priority_key, url_data)

            return True
        except Exception as e:
            logger.error(f"Error restoring frontier checkpoint: {e}")
            return False

    def clear(self) -> bool:
        """
        Clear all queues in the frontier

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Delete all queue keys
            keys = []
            for priority in range(1, self.priority_count + 1):
                keys.append(f"{self.priority_queue_key_prefix}{priority}")

            for host_id in range(self.host_count):
                keys.append(f"{self.host_queue_key_prefix}{host_id}")

            if keys:
                self.redis.delete(*keys)

            # Reset URL seen filter (optional)
            self.redis.delete(self.url_seen_key)

            logger.info("Frontier cleared")
            return True
        except Exception as e:
            logger.error(f"Error clearing frontier: {e}")
            return False
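A small in-memory exercise of the frontier above, assuming config.py defines PRIORITY_QUEUE_NUM, HOST_QUEUE_NUM and STORAGE_PATH as the rest of the project expects; the URL is a placeholder.

from frontier import URLFrontier
from models import URL, Priority

frontier = URLFrontier(use_memory=True)  # no Redis required in this mode

frontier.add_url(URL(url="https://example.com/", priority=Priority.HIGH))
frontier.add_url(URL(url="https://example.com/"))  # already seen, returns False

print(frontier.size())  # 1
next_url = frontier.get_next_url()
print(next_url.url if next_url else None)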
models.py
ADDED
@@ -0,0 +1,167 @@
"""
Data models for the web crawler
"""

import time
import hashlib
import tldextract
from urllib.parse import urlparse, urljoin, urlunparse
from datetime import datetime
from typing import Dict, List, Any, Optional, Set, Tuple
from pydantic import BaseModel, Field, HttpUrl, validator
from enum import Enum
import logging

logger = logging.getLogger(__name__)


class URLStatus(str, Enum):
    """Status of a URL in the crawl process"""
    PENDING = "pending"  # Not yet processed
    IN_PROGRESS = "in_progress"  # Currently being processed
    COMPLETED = "completed"  # Successfully processed
    FAILED = "failed"  # Failed to process
    FILTERED = "filtered"  # Filtered out based on rules
    ROBOTSTXT_EXCLUDED = "robotstxt_excluded"  # Excluded by robots.txt


class Priority(int, Enum):
    """Priority levels for URLs"""
    VERY_HIGH = 1
    HIGH = 2
    MEDIUM = 3
    LOW = 4
    VERY_LOW = 5


class URL(BaseModel):
    """URL model with metadata for crawling"""
    url: str
    normalized_url: str = ""  # Normalized version of the URL
    domain: str = ""  # Domain extracted from the URL
    depth: int = 0  # Depth from seed URL
    discovered_at: datetime = Field(default_factory=datetime.now)
    last_crawled: Optional[datetime] = None
    completed_at: Optional[datetime] = None  # When the URL was completed/failed
    status: URLStatus = URLStatus.PENDING
    priority: Priority = Priority.MEDIUM
    parent_url: Optional[str] = None  # URL that led to this URL
    retries: int = 0  # Number of times retried
    error: Optional[str] = None  # Error message if failed
    metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata

    @validator("normalized_url", pre=True, always=True)
    def set_normalized_url(cls, v, values):
        """Normalize the URL if not already set"""
        if not v and "url" in values:
            return normalize_url(values["url"])
        return v

    @validator("domain", pre=True, always=True)
    def set_domain(cls, v, values):
        """Extract domain from URL if not already set"""
        if not v and "url" in values:
            parsed = tldextract.extract(values["url"])
            return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
        return v


class RobotsInfo(BaseModel):
    """Information from robots.txt for a domain"""
    domain: str
    allowed: bool = True  # Whether crawling is allowed
    crawl_delay: Optional[float] = None  # Crawl delay in seconds
    last_fetched: datetime = Field(default_factory=datetime.now)
    user_agents: Dict[str, Dict[str, Any]] = Field(default_factory=dict)  # Info per user agent
    status_code: Optional[int] = None  # HTTP status code when fetching robots.txt


class Page(BaseModel):
    """Web page model with content and metadata"""
    url: str
    status_code: int
    content: str  # HTML content
    content_type: str
    content_length: int
    content_hash: str  # Hash of the content for duplicate detection
    headers: Dict[str, str] = Field(default_factory=dict)
    links: List[str] = Field(default_factory=list)  # Links extracted from the page
    crawled_at: datetime = Field(default_factory=datetime.now)
    redirect_url: Optional[str] = None  # URL after redirects
    elapsed_time: float = 0.0  # Time taken to fetch the page
    is_duplicate: bool = False  # Whether this is duplicate content
    metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata


class DomainStats(BaseModel):
    """Statistics for a domain"""
    domain: str
    pages_crawled: int = 0
    successful_crawls: int = 0
    failed_crawls: int = 0
    last_crawled: Optional[datetime] = None
    robots_info: Optional[RobotsInfo] = None
    crawl_times: List[float] = Field(default_factory=list)  # Recent crawl times
    errors: Dict[int, int] = Field(default_factory=dict)  # Status code counts for errors


def normalize_url(url: str) -> str:
    """
    Normalize a URL by:
    1. Converting to lowercase
    2. Removing fragments
    3. Removing default ports
    4. Sorting query parameters
    5. Removing trailing slashes
    6. Adding scheme if missing
    """
    try:
        # Parse URL
        parsed = urlparse(url)

        # Add scheme if missing
        if not parsed.scheme:
            url = 'http://' + url
            parsed = urlparse(url)

        # Get domain and path
        domain = parsed.netloc.lower()
        path = parsed.path

        # Remove default ports
        if ':' in domain:
            domain_parts = domain.split(':')
            if (parsed.scheme == 'http' and domain_parts[1] == '80') or \
               (parsed.scheme == 'https' and domain_parts[1] == '443'):
                domain = domain_parts[0]

        # Sort query parameters
        query = parsed.query
        if query:
            query_params = sorted(query.split('&'))
            query = '&'.join(query_params)

        # Remove trailing slashes from path
        while path.endswith('/') and len(path) > 1:
            path = path[:-1]

        # Add leading slash if missing
        if not path:
            path = '/'

        # Reconstruct URL
        normalized = f"{parsed.scheme}://{domain}{path}"
        if query:
            normalized += f"?{query}"

        logger.debug(f"Normalized URL: {url} -> {normalized}")
        return normalized

    except Exception as e:
        logger.error(f"Error normalizing URL {url}: {e}")
        return url


def calculate_content_hash(content: str) -> str:
    """Calculate hash of content for duplicate detection"""
    return hashlib.md5(content.encode('utf-8')).hexdigest()
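A short example of how these models compose: constructing a URL triggers the validators, which fill in normalized_url and domain automatically, and calculate_content_hash produces the digest stored on Page for duplicate detection. Illustrative only; the example URL is made up.

# Illustrative sketch of the models.py API.
from models import URL, Priority, normalize_url, calculate_content_hash

u = URL(url="HTTP://Example.com:80/a/b/?z=2&a=1#frag", priority=Priority.HIGH)
print(u.normalized_url)   # http://example.com/a/b?a=1&z=2 (validator calls normalize_url)
print(u.domain)           # example.com (validator uses tldextract)
print(u.status)           # URLStatus.PENDING by default

print(normalize_url("example.com/path/"))        # scheme added, trailing slash stripped
print(calculate_content_hash("<html></html>"))   # MD5 hex digest used for dedup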
mongo_cleanup.py
ADDED
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Script to remove all web crawler data from MongoDB without interactive confirmation
"""

import logging
from pymongo import MongoClient
import sys

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
)
logger = logging.getLogger("mongo_cleanup")

def cleanup_mongodb():
    """Remove all web crawler data from MongoDB"""
    try:
        # Connect to MongoDB
        logger.info("Connecting to MongoDB...")
        client = MongoClient("mongodb://localhost:27017/")

        # Access crawler database
        db = client["crawler"]

        # List and drop all collections
        collections = db.list_collection_names()

        if not collections:
            logger.info("No collections found in the crawler database")
        else:
            logger.info(f"Found {len(collections)} collections to drop: {collections}")

            for collection in collections:
                logger.info(f"Dropping collection: {collection}")
                db[collection].drop()

            logger.info("All crawler collections dropped successfully")

        # Optionally drop the entire database
        logger.info("Dropping entire crawler database")
        client.drop_database("crawler")

        # Check for any URLs collection in other databases that might be related
        all_dbs = client.list_database_names()
        for db_name in all_dbs:
            if db_name in ['admin', 'config', 'local']:
                continue

            db = client[db_name]
            if 'urls' in db.list_collection_names() or 'pages' in db.list_collection_names():
                logger.info(f"Found crawler-related collections in database: {db_name}")

                # Drop crawler-related collections found in other databases as well
                for collection in ['urls', 'pages', 'domains', 'stats']:
                    if collection in db.list_collection_names():
                        logger.info(f"Dropping collection {db_name}.{collection}")
                        db[collection].drop()

        logger.info("MongoDB cleanup completed successfully")
        return True

    except Exception as e:
        logger.error(f"Error cleaning up MongoDB: {e}")
        return False

if __name__ == "__main__":
    print("MongoDB Crawler Data Cleanup")
    print("--------------------------")
    print("This script will remove all web crawler collections from MongoDB")
    print()

    if len(sys.argv) > 1 and sys.argv[1] == '--force':
        # Non-interactive mode for scripting
        success = cleanup_mongodb()
        sys.exit(0 if success else 1)
    else:
        # Interactive mode
        proceed = input("Do you want to proceed with MongoDB cleanup? (y/n): ")
        if proceed.lower() != 'y':
            print("Cleanup cancelled")
            sys.exit(0)

        success = cleanup_mongodb()
        print(f"\nMongoDB cleanup: {'Completed' if success else 'Failed'}")
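For scripted resets, the --force flag above skips the interactive prompt and maps the return value of cleanup_mongodb() onto the process exit code. A minimal sketch of driving it from another Python process; the working directory containing mongo_cleanup.py is an assumption:

# Illustrative sketch: run the cleanup non-interactively from another process.
import subprocess
import sys

result = subprocess.run([sys.executable, "mongo_cleanup.py", "--force"])
print("cleanup ok" if result.returncode == 0 else "cleanup failed")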
parser.py
ADDED
@@ -0,0 +1,316 @@
"""
HTML Parser and URL Extractor component for web crawler
"""

import logging
import re
from typing import Dict, List, Set, Tuple, Optional, Any
from urllib.parse import urlparse, urljoin, unquote
from bs4 import BeautifulSoup
import tldextract
import hashlib
import os

from models import URL, Page, Priority, normalize_url
import config

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class HTMLParser:
    """
    Parses HTML content and extracts URLs and other information
    """

    def __init__(self):
        """Initialize HTML parser"""
        # Compile URL filter regex patterns for efficiency
        self.url_filters = [re.compile(pattern) for pattern in config.URL_FILTERS]

    def parse(self, page: Page, base_url: Optional[str] = None) -> Tuple[List[str], Dict[str, Any]]:
        """
        Parse HTML content and extract URLs and metadata

        Args:
            page: Page object containing HTML content
            base_url: Base URL for resolving relative links (defaults to page URL)

        Returns:
            Tuple of (extracted URLs, metadata)
        """
        if not page or not page.content:
            return [], {}

        # Use page URL as base URL if not provided
        if not base_url:
            base_url = page.url

        # Parse HTML content
        soup = BeautifulSoup(page.content, 'html.parser')

        # Extract URLs
        urls = self._extract_urls(soup, base_url)

        # Extract metadata
        metadata = self._extract_metadata(soup)

        return urls, metadata

    def _extract_urls(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """
        Extract and normalize URLs from HTML content

        Args:
            soup: BeautifulSoup object
            base_url: Base URL for resolving relative links

        Returns:
            List of normalized URLs
        """
        urls = set()
        all_urls = set()  # Track all URLs before filtering
        filtered_urls = set()  # Track filtered URLs

        logger.debug(f"Extracting URLs from page: {base_url}")

        # Extract URLs from <a> tags
        for link in soup.find_all('a', href=True):
            href = link['href'].strip()
            if href and not href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
                # Resolve relative URLs
                try:
                    absolute_url = urljoin(base_url, href)
                    all_urls.add(absolute_url)
                    # Normalize URL
                    normalized_url = normalize_url(absolute_url)
                    # Apply URL filters
                    if self._should_allow_url(normalized_url):
                        urls.add(normalized_url)
                    else:
                        filtered_urls.add(normalized_url)
                except Exception as e:
                    logger.debug(f"Error processing URL {href}: {e}")

        # Extract URLs from other elements like <iframe>, <frame>, <img>, etc.
        for tag_name, attr in [('frame', 'src'), ('iframe', 'src'), ('img', 'src'),
                               ('link', 'href'), ('script', 'src'), ('area', 'href')]:
            for tag in soup.find_all(tag_name, attrs={attr: True}):
                url = tag[attr].strip()
                if url and not url.startswith(('#', 'javascript:', 'data:', 'mailto:', 'tel:')):
                    try:
                        absolute_url = urljoin(base_url, url)
                        all_urls.add(absolute_url)
                        normalized_url = normalize_url(absolute_url)
                        if self._should_allow_url(normalized_url):
                            urls.add(normalized_url)
                        else:
                            filtered_urls.add(normalized_url)
                    except Exception as e:
                        logger.debug(f"Error processing URL {url}: {e}")

        # Log statistics
        logger.debug(f"Found {len(all_urls)} total URLs")
        logger.debug(f"Filtered {len(filtered_urls)} URLs")
        logger.debug(f"Accepted {len(urls)} URLs")

        # Log some example filtered URLs for debugging
        if filtered_urls:
            sample_filtered = list(filtered_urls)[:5]
            logger.debug(f"Sample filtered URLs: {sample_filtered}")

        # Return list of unique URLs
        return list(urls)

    def _should_allow_url(self, url: str) -> bool:
        """
        Check if URL should be allowed based on filters

        Args:
            url: URL to check

        Returns:
            True if URL should be allowed, False otherwise
        """
        try:
            parsed = urlparse(url)

            # Check scheme
            if parsed.scheme not in config.ALLOWED_SCHEMES:
                logger.debug(f"URL filtered - invalid scheme: {url}")
                return False

            # Check domain restrictions
            domain = self._extract_domain(url)

            # Check allowed domains if set
            if config.ALLOWED_DOMAINS and domain not in config.ALLOWED_DOMAINS:
                logger.debug(f"URL filtered - domain not allowed: {url} (domain: {domain}, allowed: {config.ALLOWED_DOMAINS})")
                return False

            # Check excluded domains
            if domain in config.EXCLUDED_DOMAINS:
                logger.debug(f"URL filtered - domain excluded: {url}")
                return False

            # Check URL filters
            for pattern in self.url_filters:
                if pattern.match(url):
                    logger.debug(f"URL filtered - pattern match: {url}")
                    return False

            return True

        except Exception as e:
            logger.debug(f"Error checking URL {url}: {e}")
            return False

    def _extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """
        Extract metadata from HTML content

        Args:
            soup: BeautifulSoup object

        Returns:
            Dictionary of metadata
        """
        metadata = {}

        # Extract title
        title_tag = soup.find('title')
        if title_tag and title_tag.string:
            metadata['title'] = title_tag.string.strip()

        # Extract meta description
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag and description_tag.get('content'):
            metadata['description'] = description_tag['content'].strip()

        # Extract meta keywords
        keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
        if keywords_tag and keywords_tag.get('content'):
            metadata['keywords'] = [k.strip() for k in keywords_tag['content'].split(',')]

        # Extract canonical URL
        canonical_tag = soup.find('link', attrs={'rel': 'canonical'})
        if canonical_tag and canonical_tag.get('href'):
            metadata['canonical_url'] = canonical_tag['href'].strip()

        # Extract robots meta
        robots_tag = soup.find('meta', attrs={'name': 'robots'})
        if robots_tag and robots_tag.get('content'):
            metadata['robots'] = robots_tag['content'].strip()

        # Extract Open Graph metadata
        og_metadata = {}
        for meta_tag in soup.find_all('meta', attrs={'property': re.compile('^og:')}):
            if meta_tag.get('content'):
                property_name = meta_tag['property'][3:]  # Remove 'og:' prefix
                og_metadata[property_name] = meta_tag['content'].strip()

        if og_metadata:
            metadata['open_graph'] = og_metadata

        # Extract Twitter Card metadata
        twitter_metadata = {}
        for meta_tag in soup.find_all('meta', attrs={'name': re.compile('^twitter:')}):
            if meta_tag.get('content'):
                property_name = meta_tag['name'][8:]  # Remove 'twitter:' prefix
                twitter_metadata[property_name] = meta_tag['content'].strip()

        if twitter_metadata:
            metadata['twitter_card'] = twitter_metadata

        # Extract schema.org structured data (JSON-LD)
        schema_metadata = []
        for script in soup.find_all('script', attrs={'type': 'application/ld+json'}):
            if script.string:
                try:
                    import json
                    schema_data = json.loads(script.string)
                    schema_metadata.append(schema_data)
                except Exception as e:
                    logger.debug(f"Error parsing JSON-LD: {e}")

        if schema_metadata:
            metadata['structured_data'] = schema_metadata

        # Extract text content statistics
        text_content = soup.get_text(separator=' ', strip=True)
        if text_content:
            word_count = len(text_content.split())
            metadata['word_count'] = word_count
            metadata['text_length'] = len(text_content)

        return metadata

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        parsed = tldextract.extract(url)
        return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain

    def calculate_priority(self, url: str, metadata: Dict[str, Any]) -> Priority:
        """
        Calculate priority for a URL based on various factors

        Args:
            url: URL to calculate priority for
            metadata: Metadata extracted from the page

        Returns:
            Priority enum value
        """
        # Default priority
        priority = Priority.MEDIUM

        try:
            # Extract path depth
            parsed = urlparse(url)
            path = parsed.path
            depth = len([p for p in path.split('/') if p])

            # Prioritize URLs with shorter paths
            if depth <= 1:
                priority = Priority.HIGH
            elif depth <= 3:
                priority = Priority.MEDIUM
            else:
                priority = Priority.LOW

            # Prioritize URLs with certain keywords in path
            if re.search(r'(article|blog|news|post)', path, re.IGNORECASE):
                priority = Priority.HIGH

            # Deprioritize URLs with pagination patterns
            if re.search(r'(page|p|pg)=\d+', url, re.IGNORECASE):
                priority = Priority.LOW

            # Check metadata
            if metadata:
                # Prioritize based on title
                title = metadata.get('title', '')
                if title and len(title) > 10:
                    priority = min(priority, Priority.MEDIUM)  # Raise priority if it's lower

                # Prioritize based on description
                description = metadata.get('description', '')
                if description and len(description) > 50:
                    priority = min(priority, Priority.MEDIUM)  # Raise priority if it's lower

                # Prioritize based on word count
                word_count = metadata.get('word_count', 0)
                if word_count > 1000:
                    priority = min(priority, Priority.HIGH)  # High priority for content-rich pages
                elif word_count > 500:
                    priority = min(priority, Priority.MEDIUM)

            return priority

        except Exception as e:
            logger.debug(f"Error calculating priority for URL {url}: {e}")
            return Priority.MEDIUM
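A hedged usage sketch for HTMLParser: parse() takes a Page (as defined in models.py) and returns the filtered outbound links plus the extracted metadata, which can then feed calculate_priority(). The Page field values are placeholders, and the sketch assumes the settings in config.py (ALLOWED_SCHEMES, ALLOWED_DOMAINS, URL_FILTERS) permit the example domain.

# Illustrative sketch of the HTMLParser API.
from models import Page, calculate_content_hash
from parser import HTMLParser

html = "<html><head><title>Example article</title></head><body><a href='/blog/post-1'>Post</a></body></html>"
page = Page(
    url="https://example.com/",
    status_code=200,
    content=html,
    content_type="text/html",
    content_length=len(html),
    content_hash=calculate_content_hash(html),
)

html_parser = HTMLParser()
links, metadata = html_parser.parse(page)
for link in links:
    print(link, html_parser.calculate_priority(link, metadata))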
requirements.txt
ADDED
@@ -0,0 +1,43 @@
# Core dependencies
requests==2.31.0
beautifulsoup4==4.12.3
aiohttp==3.9.3
lxml==4.9.2
html5lib==1.1
pydantic==1.10.7
pymongo==4.6.1
redis==5.0.1
boto3==1.26.123
docopt==0.6.2

# URL and DNS handling
dnspython==2.3.0
tldextract==5.1.1
validators==0.20.0
robotexclusionrulesparser==1.7.1
urllib3==1.26.15

# Monitoring and metrics
prometheus-client==0.16.0

# HTML processing
html2text==2020.1.16

# Async and concurrency
anyio==3.6.2
asyncio==3.4.3

# Utilities
python-dateutil==2.8.2
pytz==2023.3
retry==0.9.2
cryptography==40.0.1
cachetools==5.3.0

# Added from the code block
openai==1.12.0
gradio==4.16.0
chardet==5.2.0

# Dotenv
python-dotenv
robots.py
ADDED
@@ -0,0 +1,203 @@
"""
Robots.txt handler for web crawler
"""

import time
import logging
import requests
from urllib.parse import urlparse, urljoin
from typing import Dict, Optional, Tuple
import tldextract
from datetime import datetime, timedelta
from cachetools import TTLCache
import robotexclusionrulesparser

from models import RobotsInfo
import config

# Import local configuration if available
try:
    import local_config
    # Override config settings with local settings
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    logging.info("Loaded local configuration")
except ImportError:
    pass

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class RobotsHandler:
    """Handles robots.txt fetching and parsing"""

    def __init__(self, user_agent: Optional[str] = None, cache_size: int = 1000, cache_ttl: int = 3600):
        """
        Initialize robots handler

        Args:
            user_agent: User agent to use when fetching robots.txt
            cache_size: Maximum number of robots.txt rules to cache
            cache_ttl: Time to live for cache entries in seconds
        """
        self.user_agent = user_agent or config.USER_AGENT
        self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()

        # Cache of robots.txt rules for domains
        self.robots_cache = TTLCache(maxsize=cache_size, ttl=cache_ttl)

        # Create request session
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.user_agent})

    def can_fetch(self, url: str) -> Tuple[bool, Optional[float]]:
        """
        Check if URL can be fetched according to robots.txt

        Args:
            url: URL to check

        Returns:
            Tuple of (can_fetch, crawl_delay), where crawl_delay is in seconds
        """
        try:
            parsed = urlparse(url)
            base_url = f"{parsed.scheme}://{parsed.netloc}"
            domain = self._get_domain(url)

            # Check if robots info is in cache
            robots_info = self._get_robots_info(base_url, domain)

            # Check if allowed
            path = parsed.path or "/"
            allowed = robots_info.allowed
            if allowed:
                allowed = self.parser.is_allowed(self.user_agent, path)

            # Get crawl delay
            crawl_delay = robots_info.crawl_delay
            if not crawl_delay and hasattr(self.parser, 'get_crawl_delay'):
                try:
                    crawl_delay = float(self.parser.get_crawl_delay(self.user_agent) or 0)
                except Exception:
                    crawl_delay = 0

            return allowed, crawl_delay

        except Exception as e:
            logger.warning(f"Error checking robots.txt for {url}: {e}")
            # In case of error, assume allowed
            return True, None

    def _get_robots_info(self, base_url: str, domain: str) -> RobotsInfo:
        """
        Get robots.txt info for a domain

        Args:
            base_url: Base URL of the domain
            domain: Domain name

        Returns:
            RobotsInfo object
        """
        # Check if in cache
        if domain in self.robots_cache:
            return self.robots_cache[domain]

        # Fetch robots.txt
        robots_url = urljoin(base_url, "/robots.txt")
        try:
            response = self.session.get(
                robots_url,
                timeout=config.CRAWL_TIMEOUT,
                allow_redirects=True
            )

            status_code = response.status_code

            # If robots.txt exists
            if status_code == 200:
                # Parse robots.txt
                self.parser.parse(response.text)

                # Create simpler user agents info that doesn't depend on get_user_agents
                user_agents = {}
                # Just store info for our specific user agent
                crawl_delay = None
                if hasattr(self.parser, 'get_crawl_delay'):
                    try:
                        crawl_delay = self.parser.get_crawl_delay(self.user_agent)
                    except Exception:
                        crawl_delay = None

                user_agents[self.user_agent] = {
                    'crawl_delay': crawl_delay
                }

                # Create robots info
                robots_info = RobotsInfo(
                    domain=domain,
                    allowed=True,
                    crawl_delay=crawl_delay,
                    last_fetched=datetime.now(),
                    user_agents=user_agents,
                    status_code=status_code
                )
            else:
                # If no robots.txt or error, assume allowed
                self.parser.parse("")  # Parse empty robots.txt
                robots_info = RobotsInfo(
                    domain=domain,
                    allowed=True,
                    crawl_delay=None,
                    last_fetched=datetime.now(),
                    user_agents={},
                    status_code=status_code
                )

            # Cache robots info
            self.robots_cache[domain] = robots_info
            return robots_info

        except requests.RequestException as e:
            logger.warning(f"Error fetching robots.txt from {robots_url}: {e}")

            # In case of error, assume allowed
            self.parser.parse("")  # Parse empty robots.txt
            robots_info = RobotsInfo(
                domain=domain,
                allowed=True,
                crawl_delay=None,
                last_fetched=datetime.now(),
                user_agents={},
                status_code=None
            )

            # Cache robots info
            self.robots_cache[domain] = robots_info
            return robots_info

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL"""
        parsed = tldextract.extract(url)
        return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain

    def clear_cache(self) -> None:
        """Clear the robots.txt cache"""
        self.robots_cache.clear()

    def update_cache(self, domain: str) -> None:
        """
        Force update of a domain's robots.txt in the cache

        Args:
            domain: Domain to update
        """
        if domain in self.robots_cache:
            del self.robots_cache[domain]
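A small sketch of the RobotsHandler contract: can_fetch() returns both the allow/deny decision and any crawl-delay hint, and results are cached per domain for cache_ttl seconds. The URL and user agent below are illustrative.

# Illustrative sketch of the RobotsHandler API.
from robots import RobotsHandler

handler = RobotsHandler(user_agent="MyCrawler/1.0", cache_ttl=600)
allowed, crawl_delay = handler.can_fetch("https://example.com/some/page")

if allowed:
    # Honour the crawl delay advertised by robots.txt, if any
    delay = crawl_delay or 0.0
    print(f"Fetch permitted, waiting {delay:.1f}s between requests to this host")
else:
    print("robots.txt disallows this path for our user agent")

# Force a re-fetch of a domain's robots.txt on the next check
handler.update_cache("example.com")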
run_crawler.py
ADDED
@@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
Main script to run the web crawler with command line arguments
"""

import os
import sys
import time
import logging
import argparse
import signal
from urllib.parse import urlparse

# Add the current directory to path if needed
script_dir = os.path.dirname(os.path.abspath(__file__))
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)

# Configure logging - do this first
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'crawler.log'))
    ]
)
logger = logging.getLogger("run_crawler")

# Now import the crawler components
logger.info("Importing crawler modules...")
try:
    from crawler import Crawler
    from models import Priority
    logger.info("Successfully imported crawler modules")
except Exception as e:
    logger.error(f"Error importing crawler modules: {e}", exc_info=True)
    sys.exit(1)

def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Run the web crawler with custom settings')

    parser.add_argument('--seed', nargs='+', metavar='URL',
                        help='One or more seed URLs to start crawling')

    parser.add_argument('--depth', type=int, default=None,
                        help='Maximum crawl depth')

    parser.add_argument('--workers', type=int, default=None,
                        help='Number of worker threads')

    parser.add_argument('--delay', type=float, default=None,
                        help='Delay between requests to the same domain (in seconds)')

    parser.add_argument('--respect-robots', dest='respect_robots', action='store_true',
                        help='Respect robots.txt rules')

    parser.add_argument('--ignore-robots', dest='respect_robots', action='store_false',
                        help='Ignore robots.txt rules')

    parser.add_argument('--user-agent', type=str, default=None,
                        help='User agent to use for requests')

    parser.add_argument('--async', dest='async_mode', action='store_true',
                        help='Use async mode for requests')

    parser.add_argument('--domain-filter', type=str, default=None,
                        help='Only crawl URLs that match this domain')

    parser.add_argument('--reset-db', action='store_true',
                        help='Reset MongoDB and flush Redis data before starting')

    parser.add_argument('--verbose', action='store_true',
                        help='Enable verbose logging')

    args = parser.parse_args()

    # Set log level based on verbose flag
    if args.verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")

    return args

def reset_databases():
    """Reset MongoDB and flush Redis data"""
    success = True

    # Reset MongoDB
    try:
        logger.info("Starting MongoDB cleanup...")
        from mongo_cleanup import cleanup_mongodb
        mongo_success = cleanup_mongodb()
        if not mongo_success:
            logger.warning("MongoDB cleanup may not have been completely successful")
            success = False
        else:
            logger.info("MongoDB cleanup completed successfully")
    except Exception as e:
        logger.error(f"Error cleaning up MongoDB: {e}", exc_info=True)
        success = False

    # Flush Redis
    try:
        logger.info("Starting Redis flush...")
        import redis
        logger.debug("Connecting to Redis to flush data...")

        # Set a timeout for Redis connection
        r = redis.Redis(host='localhost', port=6379, db=0, socket_timeout=5)

        # Check if Redis is available
        try:
            logger.debug("Testing Redis connection...")
            ping_result = r.ping()
            logger.debug(f"Redis ping result: {ping_result}")

            # If connection works, flush all data
            logger.info("Flushing all Redis data...")
            result = r.flushall()
            logger.info(f"Redis flush result: {result}")
        except redis.ConnectionError as e:
            logger.error(f"Redis connection error: {e}")
            success = False
    except Exception as e:
        logger.error(f"Error flushing Redis: {e}", exc_info=True)
        success = False

    return success

def setup_signal_handlers(crawler_instance):
    """Setup signal handlers for graceful shutdown"""
    def signal_handler(sig, frame):
        logger.info(f"Received signal {sig}, shutting down gracefully...")
        if crawler_instance and crawler_instance.running:
            logger.info("Stopping crawler...")
            crawler_instance.stop()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

def run_crawler():
    """Run the crawler with command-line arguments"""
    args = parse_arguments()
    crawler = None

    try:
        logger.info("Starting the web crawler...")

        # Reset database if requested
        if args.reset_db:
            logger.info("Resetting MongoDB and flushing Redis data...")
            if not reset_databases():
                logger.warning("Database reset was not completely successful")

        # Create crawler instance
        logger.info("Creating crawler instance...")
        crawler = Crawler()
        logger.info("Crawler instance created successfully")

        # Setup signal handlers
        setup_signal_handlers(crawler)

        # Override settings from command line if provided
        if args.depth is not None:
            import config
            config.MAX_DEPTH = args.depth
            logger.info(f"Setting maximum depth to {args.depth}")

        if args.delay is not None:
            import config
            config.DELAY_BETWEEN_REQUESTS = args.delay
            logger.info(f"Setting delay between requests to {args.delay} seconds")

        if args.respect_robots is not None:
            import config
            config.RESPECT_ROBOTS_TXT = args.respect_robots
            logger.info(f"Respect robots.txt: {args.respect_robots}")

        if args.user_agent is not None:
            import config
            config.USER_AGENT = args.user_agent
            logger.info(f"Using user agent: {args.user_agent}")

        # Add seed URLs if provided
        if args.seed:
            logger.info(f"Adding {len(args.seed)} seed URLs")
            seed_urls = []
            for url in args.seed:
                if not (url.startswith('http://') or url.startswith('https://')):
                    url = 'https://' + url
                seed_urls.append(url)
                logger.debug(f"Added seed URL: {url}")

            # Add the URLs to the frontier
            logger.info("Adding seed URLs to frontier...")
            added = crawler.add_seed_urls(seed_urls, Priority.VERY_HIGH)
            logger.info(f"Successfully added {added} seed URLs to the frontier")

        # Apply domain filter if provided
        if args.domain_filter:
            import config

            # Allow both domain.com or http://domain.com formats
            domain = args.domain_filter
            if domain.startswith('http://') or domain.startswith('https://'):
                domain = urlparse(domain).netloc

            config.ALLOWED_DOMAINS = [domain]
            logger.info(f"Filtering to domain: {domain}")

        # Start the crawler
        num_workers = args.workers if args.workers is not None else 4

        logger.info(f"Starting crawler with {num_workers} workers...")
        crawler.start(num_workers=num_workers, async_mode=args.async_mode)
        # If we get here, crawler has finished or was stopped
        logger.info("Crawler finished")

    except KeyboardInterrupt:
        logger.info("Crawler interrupted by user")
        if crawler and crawler.running:
            logger.info("Stopping crawler...")
            crawler.stop()
    except Exception as e:
        logger.error(f"Error running crawler: {e}", exc_info=True)
        if crawler and crawler.running:
            try:
                logger.info("Attempting to stop crawler after error...")
                crawler.stop()
            except Exception:
                pass

if __name__ == "__main__":
    run_crawler()
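Typical invocation is from the command line; the sketch below mirrors the argparse options defined above by setting sys.argv before calling run_crawler(), which can be convenient in a notebook or test harness. The seed URL and option values are placeholders.

# Illustrative sketch: drive run_crawler() programmatically via sys.argv.
import sys
from run_crawler import run_crawler

# Equivalent to:
#   python run_crawler.py --seed example.com --depth 2 --workers 2 --domain-filter example.com --reset-db
sys.argv = [
    "run_crawler.py",
    "--seed", "example.com",
    "--depth", "2",
    "--workers", "2",
    "--domain-filter", "example.com",
    "--reset-db",
]
run_crawler()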
seo_analyzer_ui.py
ADDED
@@ -0,0 +1,708 @@
"""
SEO Analyzer UI using Gradio, Web Crawler, and OpenAI
"""

import gradio as gr
import logging
import json
from typing import Dict, List, Any, Tuple, Optional
from urllib.parse import urlparse
import tldextract
from openai import OpenAI
import time
import os
import threading
import queue
import shutil
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import tempfile

from crawler import Crawler
from frontier import URLFrontier
from models import URL, Page
import config
from run_crawler import reset_databases
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

# Check if we're in deployment mode (e.g., Hugging Face Spaces)
IS_DEPLOYMENT = os.getenv('DEPLOYMENT', 'false').lower() == 'true'

# Custom CSS for better styling
CUSTOM_CSS = """
.container {
    max-width: 1200px !important;
    margin: auto;
    padding: 20px;
}

.header {
    text-align: center;
    margin-bottom: 2rem;
}

.header h1 {
    color: #2d3748;
    font-size: 2.5rem;
    font-weight: 700;
    margin-bottom: 1rem;
}

.header p {
    color: #4a5568;
    font-size: 1.1rem;
    max-width: 800px;
    margin: 0 auto;
}

.input-section {
    background: #f7fafc;
    border-radius: 12px;
    padding: 24px;
    margin-bottom: 24px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

.analysis-section {
    background: white;
    border-radius: 12px;
    padding: 24px;
    margin-top: 24px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

.log-section {
    font-family: monospace;
    background: #1a202c;
    color: #e2e8f0;
    padding: 16px;
    border-radius: 8px;
    margin-top: 24px;
}

/* Custom styling for inputs */
.input-container {
    background: white;
    padding: 16px;
    border-radius: 8px;
    margin-bottom: 16px;
}

/* Custom styling for the slider */
.slider-container {
    padding: 12px;
    background: white;
    border-radius: 8px;
}

/* Custom styling for buttons */
.primary-button {
    background: #4299e1 !important;
    color: white !important;
    padding: 12px 24px !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    transition: all 0.3s ease !important;
}

.primary-button:hover {
    background: #3182ce !important;
    transform: translateY(-1px) !important;
}

/* Markdown output styling */
.markdown-output {
    font-family: system-ui, -apple-system, sans-serif;
    line-height: 1.6;
}

.markdown-output h1 {
    color: #2d3748;
    border-bottom: 2px solid #e2e8f0;
    padding-bottom: 0.5rem;
}

.markdown-output h2 {
    color: #4a5568;
    margin-top: 2rem;
}

.markdown-output h3 {
    color: #718096;
    margin-top: 1.5rem;
}

/* Progress bar styling */
.progress-bar {
    height: 8px !important;
    border-radius: 4px !important;
    background: #ebf8ff !important;
}

.progress-bar-fill {
    background: #4299e1 !important;
    border-radius: 4px !important;
}

/* Add some spacing between sections */
.gap {
    margin: 2rem 0;
}
"""

# Create a custom handler that will store logs in a queue
class QueueHandler(logging.Handler):
    def __init__(self, log_queue):
        super().__init__()
        self.log_queue = log_queue

    def emit(self, record):
        log_entry = self.format(record)
        try:
            self.log_queue.put_nowait(f"{datetime.now().strftime('%H:%M:%S')} - {log_entry}")
        except queue.Full:
            pass  # Ignore if queue is full

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format='%(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

logger.info(f"IS_DEPLOYMENT: {IS_DEPLOYMENT}")

class InMemoryStorage:
    """Simple in-memory storage for deployment mode"""
    def __init__(self):
        self.urls = {}
        self.pages = {}

    def reset(self):
        self.urls.clear()
        self.pages.clear()

    def add_url(self, url_obj):
        self.urls[url_obj.url] = url_obj

    def add_page(self, page_obj):
        self.pages[page_obj.url] = page_obj

    def get_url(self, url):
        return self.urls.get(url)

    def get_page(self, url):
        return self.pages.get(url)

class SEOAnalyzer:
    """
    SEO Analyzer that combines web crawler with OpenAI analysis
    """

    def __init__(self, api_key: str):
        """Initialize SEO Analyzer"""
        self.client = OpenAI(api_key=api_key)
        self.crawler = None
        self.crawled_pages = []
        self.pages_crawled = 0
        self.max_pages = 0
        self.crawl_complete = threading.Event()
        self.log_queue = queue.Queue(maxsize=1000)
        self.session_id = str(uuid.uuid4())
        self.storage = InMemoryStorage() if IS_DEPLOYMENT else None

        # Add queue handler to logger
        queue_handler = QueueHandler(self.log_queue)
        queue_handler.setFormatter(logging.Formatter('%(levelname)s - %(message)s'))
        logger.addHandler(queue_handler)

    def _setup_session_storage(self) -> Tuple[str, str, str]:
        """
        Set up session-specific storage directories

        Returns:
            Tuple of (storage_path, html_path, log_path)
        """
        # Create session-specific paths
        session_storage = os.path.join(config.STORAGE_PATH, self.session_id)
        session_html = os.path.join(session_storage, "html")
        session_logs = os.path.join(session_storage, "logs")

        # Create directories
        os.makedirs(session_storage, exist_ok=True)
        os.makedirs(session_html, exist_ok=True)
        os.makedirs(session_logs, exist_ok=True)

        logger.info(f"Created session storage at {session_storage}")
        return session_storage, session_html, session_logs

    def _cleanup_session_storage(self):
        """Clean up session-specific storage"""
        session_path = os.path.join(config.STORAGE_PATH, self.session_id)
        try:
            if os.path.exists(session_path):
                shutil.rmtree(session_path)
                logger.info(f"Cleaned up session storage at {session_path}")
        except Exception as e:
            logger.error(f"Error cleaning up session storage: {e}")

    def _reset_storage(self):
        """Reset storage based on deployment mode"""
        if IS_DEPLOYMENT:
            self.storage.reset()
        else:
            reset_databases()

    def analyze_website(self, url: str, max_pages: int = 10, progress: gr.Progress = gr.Progress()) -> Tuple[str, List[Dict], str]:
        """
        Crawl website and analyze SEO using OpenAI

        Args:
            url: Seed URL to crawl
            max_pages: Maximum number of pages to crawl
            progress: Gradio progress indicator

        Returns:
            Tuple of (overall analysis, list of page-specific analyses, log output)
        """
        try:
            # Reset state
            self.crawled_pages = []
            self.pages_crawled = 0
            self.max_pages = max_pages
            self.crawl_complete.clear()

            # Set up storage
            if IS_DEPLOYMENT:
                # Use temporary directory for file storage in deployment
                temp_dir = tempfile.mkdtemp()
                session_storage = temp_dir
                session_html = os.path.join(temp_dir, "html")
                session_logs = os.path.join(temp_dir, "logs")
                os.makedirs(session_html, exist_ok=True)
                os.makedirs(session_logs, exist_ok=True)
            else:
                session_storage, session_html, session_logs = self._setup_session_storage()

            # Update config paths for this session
            config.HTML_STORAGE_PATH = session_html
            config.LOG_PATH = session_logs

            # Clear log queue
            while not self.log_queue.empty():
                self.log_queue.get_nowait()

            logger.info(f"Starting analysis of {url} with max_pages={max_pages}")

            # Reset storage
            logger.info("Resetting storage...")
            self._reset_storage()
            logger.info("Storage reset completed")

            # Create new crawler instance with appropriate storage
            logger.info("Creating crawler instance...")
            if IS_DEPLOYMENT:
                # In deployment mode, use in-memory storage
                self.crawler = Crawler(storage=self.storage)
                # Set frontier to use memory mode
                self.crawler.frontier = URLFrontier(use_memory=True)
            else:
                # In local mode, use MongoDB and Redis
                self.crawler = Crawler()
            logger.info("Crawler instance created successfully")

            # Extract domain for filtering
            domain = self._extract_domain(url)
            logger.info(f"Analyzing domain: {domain}")

            # Add seed URL and configure domain filter
            self.crawler.add_seed_urls([url])
            config.ALLOWED_DOMAINS = [domain]
            logger.info("Added seed URL and configured domain filter")

            # Override the crawler's _process_url method to capture pages
            original_process_url = self.crawler._process_url
            def wrapped_process_url(url_obj):
                if self.pages_crawled >= self.max_pages:
                    self.crawler.running = False  # Signal crawler to stop
                    self.crawl_complete.set()
                    return

                original_process_url(url_obj)

                # Get the page based on storage mode
                if IS_DEPLOYMENT:
                    # In deployment mode, get page from in-memory storage
                    page = self.storage.get_page(url_obj.url)
                    if page:
                        _, metadata = self.crawler.parser.parse(page)
                        self.crawled_pages.append({
                            'url': url_obj.url,
                            'content': page.content,
                            'metadata': metadata
                        })
                        self.pages_crawled += 1
                        logger.info(f"Crawled page {self.pages_crawled}/{max_pages}: {url_obj.url}")
                else:
                    # In local mode, get page from MongoDB
                    page_data = self.crawler.pages_collection.find_one({'url': url_obj.url})
                    if page_data and page_data.get('content'):
                        _, metadata = self.crawler.parser.parse(Page(**page_data))
                        self.crawled_pages.append({
                            'url': url_obj.url,
                            'content': page_data['content'],
                            'metadata': metadata
                        })
                        self.pages_crawled += 1
                        logger.info(f"Crawled page {self.pages_crawled}/{max_pages}: {url_obj.url}")

                if self.pages_crawled >= self.max_pages:
                    self.crawler.running = False  # Signal crawler to stop
                    self.crawl_complete.set()

            self.crawler._process_url = wrapped_process_url

            def run_crawler():
                try:
                    # Skip signal handler registration
                    self.crawler.running = True
                    with ThreadPoolExecutor(max_workers=1) as executor:
                        try:
                            futures = [executor.submit(self.crawler._crawl_worker)]
                            for future in futures:
                                future.result()
                        except Exception as e:
                            logger.error(f"Error in crawler worker: {e}")
                        finally:
                            self.crawler.running = False
                            self.crawl_complete.set()
                except Exception as e:
                    logger.error(f"Error in run_crawler: {e}")
                    self.crawl_complete.set()

            # Start crawler in a thread
            crawler_thread = threading.Thread(target=run_crawler)
            crawler_thread.daemon = True
            crawler_thread.start()

            # Wait for completion or timeout with progress updates
            timeout = 300  # 5 minutes
            start_time = time.time()
            last_progress = 0
            while not self.crawl_complete.is_set() and time.time() - start_time < timeout:
                current_progress = min(0.8, self.pages_crawled / max_pages)
                if current_progress != last_progress:
                    progress(current_progress, f"Crawled {self.pages_crawled}/{max_pages} pages")
                    last_progress = current_progress
                time.sleep(0.1)  # More frequent updates

            if time.time() - start_time >= timeout:
                logger.warning("Crawler timed out")
                self.crawler.running = False

            # Wait for thread to finish
            crawler_thread.join(timeout=10)

            # Restore original method
            self.crawler._process_url = original_process_url

            # Collect all logs
            logs = []
            while not self.log_queue.empty():
                logs.append(self.log_queue.get_nowait())
            log_output = "\n".join(logs)

            if not self.crawled_pages:
                self._cleanup_session_storage()
                return "No pages were successfully crawled.", [], log_output

            logger.info("Starting OpenAI analysis...")
            progress(0.9, "Analyzing crawled pages with OpenAI...")

            # Analyze crawled pages with OpenAI
            overall_analysis = self._get_overall_analysis(self.crawled_pages)
            progress(0.95, "Generating page-specific analyses...")
            page_analyses = self._get_page_analyses(self.crawled_pages)

            logger.info("Analysis complete")
            progress(1.0, "Analysis complete")

            # Format the results
            formatted_analysis = f"""
# SEO Analysis Report for {domain}

## Overall Analysis
{overall_analysis}

## Page-Specific Analyses
"""
            for page_analysis in page_analyses:
                formatted_analysis += f"""
### {page_analysis['url']}
{page_analysis['analysis']}
"""

            # Clean up all resources
            logger.info("Cleaning up resources...")
            if IS_DEPLOYMENT:
                shutil.rmtree(temp_dir, ignore_errors=True)
|
452 |
+
self.storage.reset()
|
453 |
+
else:
|
454 |
+
self._cleanup_session_storage()
|
455 |
+
self._reset_storage()
|
456 |
+
logger.info("All resources cleaned up")
|
457 |
+
|
458 |
+
return formatted_analysis, page_analyses, log_output
|
459 |
+
|
460 |
+
except Exception as e:
|
461 |
+
logger.error(f"Error analyzing website: {e}")
|
462 |
+
# Clean up all resources even on error
|
463 |
+
if IS_DEPLOYMENT:
|
464 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
465 |
+
self.storage.reset()
|
466 |
+
else:
|
467 |
+
self._cleanup_session_storage()
|
468 |
+
self._reset_storage()
|
469 |
+
# Collect all logs
|
470 |
+
logs = []
|
471 |
+
while not self.log_queue.empty():
|
472 |
+
logs.append(self.log_queue.get_nowait())
|
473 |
+
log_output = "\n".join(logs)
|
474 |
+
return f"Error analyzing website: {str(e)}", [], log_output
|
475 |
+
|
476 |
+
def _extract_domain(self, url: str) -> str:
|
477 |
+
"""Extract domain from URL"""
|
478 |
+
extracted = tldextract.extract(url)
|
479 |
+
return f"{extracted.domain}.{extracted.suffix}"
|
480 |
+
|
481 |
+
def _get_overall_analysis(self, pages: List[Dict]) -> str:
|
482 |
+
"""Get overall SEO analysis using OpenAI"""
|
483 |
+
try:
|
484 |
+
# Prepare site overview for analysis
|
485 |
+
site_overview = {
|
486 |
+
'num_pages': len(pages),
|
487 |
+
'pages': [{
|
488 |
+
'url': page['url'],
|
489 |
+
'metadata': page['metadata']
|
490 |
+
} for page in pages]
|
491 |
+
}
|
492 |
+
|
493 |
+
# Create analysis prompt
|
494 |
+
prompt = f"""
|
495 |
+
You are an expert SEO consultant. Analyze this website's SEO based on the crawled data:
|
496 |
+
|
497 |
+
{json.dumps(site_overview, indent=2)}
|
498 |
+
|
499 |
+
Provide a comprehensive SEO analysis including:
|
500 |
+
1. Overall site structure and navigation
|
501 |
+
2. Common SEO issues across pages
|
502 |
+
3. Content quality and optimization
|
503 |
+
4. Technical SEO recommendations
|
504 |
+
5. Priority improvements
|
505 |
+
|
506 |
+
Format your response in Markdown.
|
507 |
+
"""
|
508 |
+
|
509 |
+
# Get analysis from OpenAI
|
510 |
+
response = self.client.chat.completions.create(
|
511 |
+
model="gpt-4o-mini",
|
512 |
+
messages=[
|
513 |
+
{"role": "system", "content": "You are an expert SEO consultant providing detailed website analysis."},
|
514 |
+
{"role": "user", "content": prompt}
|
515 |
+
],
|
516 |
+
temperature=0.7,
|
517 |
+
max_tokens=2000
|
518 |
+
)
|
519 |
+
|
520 |
+
return response.choices[0].message.content
|
521 |
+
|
522 |
+
except Exception as e:
|
523 |
+
logger.error(f"Error getting overall analysis: {e}")
|
524 |
+
return f"Error generating overall analysis: {str(e)}"
|
525 |
+
|
526 |
+
def _get_page_analyses(self, pages: List[Dict]) -> List[Dict]:
|
527 |
+
"""Get page-specific SEO analyses using OpenAI"""
|
528 |
+
page_analyses = []
|
529 |
+
|
530 |
+
for page in pages:
|
531 |
+
try:
|
532 |
+
# Create page analysis prompt
|
533 |
+
prompt = f"""
|
534 |
+
Analyze this page's SEO:
|
535 |
+
|
536 |
+
URL: {page['url']}
|
537 |
+
Metadata: {json.dumps(page['metadata'], indent=2)}
|
538 |
+
|
539 |
+
Provide specific recommendations for:
|
540 |
+
1. Title and meta description
|
541 |
+
2. Heading structure
|
542 |
+
3. Content optimization
|
543 |
+
4. Internal linking
|
544 |
+
5. Technical improvements
|
545 |
+
|
546 |
+
Format your response in Markdown.
|
547 |
+
"""
|
548 |
+
|
549 |
+
# Get analysis from OpenAI
|
550 |
+
response = self.client.chat.completions.create(
|
551 |
+
model="gpt-4o-mini",
|
552 |
+
messages=[
|
553 |
+
{"role": "system", "content": "You are an expert SEO consultant providing detailed page analysis."},
|
554 |
+
{"role": "user", "content": prompt}
|
555 |
+
],
|
556 |
+
temperature=0.7,
|
557 |
+
max_tokens=1000
|
558 |
+
)
|
559 |
+
|
560 |
+
page_analyses.append({
|
561 |
+
'url': page['url'],
|
562 |
+
'analysis': response.choices[0].message.content
|
563 |
+
})
|
564 |
+
|
565 |
+
# Sleep to respect rate limits
|
566 |
+
time.sleep(1)
|
567 |
+
|
568 |
+
except Exception as e:
|
569 |
+
logger.error(f"Error analyzing page {page['url']}: {e}")
|
570 |
+
page_analyses.append({
|
571 |
+
'url': page['url'],
|
572 |
+
'analysis': f"Error analyzing page: {str(e)}"
|
573 |
+
})
|
574 |
+
|
575 |
+
return page_analyses
|
576 |
+
|
577 |
+
def create_ui() -> gr.Interface:
|
578 |
+
"""Create Gradio interface"""
|
579 |
+
|
580 |
+
def analyze(url: str, api_key: str, max_pages: int, progress: gr.Progress = gr.Progress()) -> Tuple[str, str]:
|
581 |
+
"""Gradio interface function"""
|
582 |
+
try:
|
583 |
+
# Initialize analyzer
|
584 |
+
analyzer = SEOAnalyzer(api_key)
|
585 |
+
|
586 |
+
# Run analysis with progress updates
|
587 |
+
analysis, _, logs = analyzer.analyze_website(url, max_pages, progress)
|
588 |
+
|
589 |
+
# Collect all logs
|
590 |
+
log_output = ""
|
591 |
+
while not analyzer.log_queue.empty():
|
592 |
+
try:
|
593 |
+
log_output += analyzer.log_queue.get_nowait() + "\n"
|
594 |
+
except queue.Empty:
|
595 |
+
break
|
596 |
+
|
597 |
+
# Set progress to complete
|
598 |
+
progress(1.0, "Analysis complete")
|
599 |
+
|
600 |
+
# Return results
|
601 |
+
return analysis, log_output
|
602 |
+
|
603 |
+
except Exception as e:
|
604 |
+
error_msg = f"Error: {str(e)}"
|
605 |
+
logger.error(error_msg)
|
606 |
+
return error_msg, error_msg
|
607 |
+
|
608 |
+
# Create markdown content for the about section
|
609 |
+
about_markdown = """
|
610 |
+
# 🔍 SEO Analyzer Pro
|
611 |
+
|
612 |
+
Analyze your website's SEO performance using advanced crawling and AI technology.
|
613 |
+
|
614 |
+
### Features:
|
615 |
+
- 🕷️ Intelligent Web Crawling
|
616 |
+
- 🧠 AI-Powered Analysis
|
617 |
+
- 📊 Comprehensive Reports
|
618 |
+
- 🚀 Performance Insights
|
619 |
+
|
620 |
+
### How to Use:
|
621 |
+
1. Enter your website URL
|
622 |
+
2. Provide your OpenAI API key
|
623 |
+
3. Choose how many pages to analyze
|
624 |
+
4. Click Analyze and watch the magic happen!
|
625 |
+
|
626 |
+
### What You'll Get:
|
627 |
+
- Detailed SEO analysis
|
628 |
+
- Content quality assessment
|
629 |
+
- Technical recommendations
|
630 |
+
- Performance insights
|
631 |
+
- Actionable improvements
|
632 |
+
"""
|
633 |
+
|
634 |
+
# Create the interface with custom styling
|
635 |
+
with gr.Blocks(css=CUSTOM_CSS) as iface:
|
636 |
+
gr.Markdown(about_markdown)
|
637 |
+
|
638 |
+
with gr.Row():
|
639 |
+
with gr.Column(scale=2):
|
640 |
+
with gr.Group(elem_classes="input-section"):
|
641 |
+
gr.Markdown("### 📝 Enter Website Details")
|
642 |
+
url_input = gr.Textbox(
|
643 |
+
label="Website URL",
|
644 |
+
placeholder="https://example.com",
|
645 |
+
elem_classes="input-container",
|
646 |
+
info="Enter the full URL of the website you want to analyze (e.g., https://example.com)"
|
647 |
+
)
|
648 |
+
api_key = gr.Textbox(
|
649 |
+
label="OpenAI API Key",
|
650 |
+
placeholder="sk-...",
|
651 |
+
type="password",
|
652 |
+
elem_classes="input-container",
|
653 |
+
info="Your OpenAI API key is required for AI-powered analysis. Keep this secure!"
|
654 |
+
)
|
655 |
+
max_pages = gr.Slider(
|
656 |
+
minimum=1,
|
657 |
+
maximum=50,
|
658 |
+
value=10,
|
659 |
+
step=1,
|
660 |
+
label="Maximum Pages to Crawl",
|
661 |
+
elem_classes="slider-container",
|
662 |
+
info="Choose how many pages to analyze. More pages = more comprehensive analysis but takes longer"
|
663 |
+
)
|
664 |
+
analyze_btn = gr.Button(
|
665 |
+
"🔍 Analyze Website",
|
666 |
+
elem_classes="primary-button"
|
667 |
+
)
|
668 |
+
|
669 |
+
with gr.Row():
|
670 |
+
with gr.Column():
|
671 |
+
with gr.Group(elem_classes="analysis-section"):
|
672 |
+
gr.Markdown("### 📊 Analysis Results")
|
673 |
+
analysis_output = gr.Markdown(
|
674 |
+
label="SEO Analysis",
|
675 |
+
elem_classes="markdown-output"
|
676 |
+
)
|
677 |
+
|
678 |
+
with gr.Row():
|
679 |
+
with gr.Column():
|
680 |
+
with gr.Group(elem_classes="log-section"):
|
681 |
+
gr.Markdown("### 📋 Process Logs")
|
682 |
+
logs_output = gr.Textbox(
|
683 |
+
label="Logs",
|
684 |
+
lines=10,
|
685 |
+
elem_classes="log-output"
|
686 |
+
)
|
687 |
+
|
688 |
+
# Connect the button click to the analyze function
|
689 |
+
analyze_btn.click(
|
690 |
+
fn=analyze,
|
691 |
+
inputs=[url_input, api_key, max_pages],
|
692 |
+
outputs=[analysis_output, logs_output],
|
693 |
+
)
|
694 |
+
|
695 |
+
return iface
|
696 |
+
|
697 |
+
if __name__ == "__main__":
|
698 |
+
# Create base storage directory if it doesn't exist
|
699 |
+
os.makedirs(config.STORAGE_PATH, exist_ok=True)
|
700 |
+
|
701 |
+
# Create and launch UI
|
702 |
+
ui = create_ui()
|
703 |
+
ui.launch(
|
704 |
+
share=False,
|
705 |
+
server_name="0.0.0.0",
|
706 |
+
show_api=False,
|
707 |
+
show_error=True,
|
708 |
+
)
|
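A minimal sketch of driving the analyzer above without the Gradio front end, for scripting or debugging. The OPENAI_API_KEY placeholder and the no-op progress callback are assumptions; SEOAnalyzer and analyze_website are used as they appear in seo_analyzer_ui.py, so treat this as illustrative rather than an official entry point.

    # Hypothetical headless driver for the SEOAnalyzer defined above
    from seo_analyzer_ui import SEOAnalyzer

    def no_progress(value, desc=""):
        # Stand-in for gr.Progress(); simply discards progress updates
        pass

    analyzer = SEOAnalyzer(OPENAI_API_KEY)  # OPENAI_API_KEY: assumed placeholder for a real key string
    report, page_reports, logs = analyzer.analyze_website("https://example.com", 3, no_progress)
    print(report)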
storage.py
ADDED
@@ -0,0 +1,888 @@
"""
Storage component for the web crawler.

Handles storing and retrieving crawled web pages using:
1. MongoDB for metadata, URL information, and crawl stats
2. Disk-based storage for HTML content
3. Optional Amazon S3 integration for scalable storage
"""

import os
import logging
import time
import datetime
import hashlib
import json
import gzip
import shutil
from typing import Dict, List, Optional, Union, Any, Tuple
from urllib.parse import urlparse
import pymongo
from pymongo import MongoClient, UpdateOne
from pymongo.errors import PyMongoError, BulkWriteError
import boto3
from botocore.exceptions import ClientError

from models import Page, URL
import config

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class StorageManager:
    """
    Storage manager for web crawler data

    Handles:
    - MongoDB for metadata, URL information, and stats
    - Disk-based storage for HTML content
    - Optional Amazon S3 integration
    """

    def __init__(self,
                 mongo_uri: Optional[str] = None,
                 use_s3: bool = False,
                 compress_html: bool = True,
                 max_disk_usage_gb: float = 100.0):
        """
        Initialize the storage manager

        Args:
            mongo_uri: MongoDB connection URI
            use_s3: Whether to use Amazon S3 for HTML storage
            compress_html: Whether to compress HTML content
            max_disk_usage_gb: Maximum disk space to use in GB
        """
        self.mongo_uri = mongo_uri or config.MONGODB_URI
        self.use_s3 = use_s3
        self.compress_html = compress_html
        self.max_disk_usage_gb = max_disk_usage_gb

        # Connect to MongoDB
        self.mongo_client = MongoClient(self.mongo_uri)
        self.db = self.mongo_client[config.MONGODB_DB]

        # MongoDB collections
        self.pages_collection = self.db['pages']
        self.urls_collection = self.db['urls']
        self.stats_collection = self.db['stats']

        # Create necessary indexes
        self._create_indexes()

        # S3 client (if enabled)
        self.s3_client = None
        if self.use_s3:
            self._init_s3_client()

        # Ensure storage directories exist
        self._ensure_directories()

        # Bulk operation buffers
        self.page_buffer = []
        self.url_buffer = []
        self.max_buffer_size = 100

        # Statistics
        self.stats = {
            'pages_stored': 0,
            'pages_retrieved': 0,
            'urls_stored': 0,
            'urls_retrieved': 0,
            'disk_space_used': 0,
            's3_objects_stored': 0,
            'mongodb_size': 0,
            'storage_errors': 0,
            'start_time': time.time()
        }

    def _create_indexes(self) -> None:
        """Create necessary indexes in MongoDB collections"""
        try:
            # Pages collection indexes
            self.pages_collection.create_index('url', unique=True)
            self.pages_collection.create_index('content_hash')
            self.pages_collection.create_index('crawled_at')
            self.pages_collection.create_index('domain')

            # URLs collection indexes
            self.urls_collection.create_index('url', unique=True)
            self.urls_collection.create_index('normalized_url')
            self.urls_collection.create_index('domain')
            self.urls_collection.create_index('status')
            self.urls_collection.create_index('priority')
            self.urls_collection.create_index('last_crawled')

            logger.info("MongoDB indexes created")
        except PyMongoError as e:
            logger.error(f"Error creating MongoDB indexes: {e}")
            self.stats['storage_errors'] += 1

    def _init_s3_client(self) -> None:
        """Initialize AWS S3 client"""
        try:
            self.s3_client = boto3.client(
                's3',
                aws_access_key_id=config.AWS_ACCESS_KEY,
                aws_secret_access_key=config.AWS_SECRET_KEY,
                region_name=config.AWS_REGION
            )
            logger.info("S3 client initialized")

            # Create bucket if it doesn't exist
            self._ensure_s3_bucket()
        except Exception as e:
            logger.error(f"Error initializing S3 client: {e}")
            self.use_s3 = False
            self.stats['storage_errors'] += 1

    def _ensure_s3_bucket(self) -> None:
        """Create S3 bucket if it doesn't exist"""
        if not self.s3_client:
            return

        try:
            # Check if bucket exists
            self.s3_client.head_bucket(Bucket=config.S3_BUCKET)
            logger.info(f"S3 bucket '{config.S3_BUCKET}' exists")
        except ClientError as e:
            error_code = e.response.get('Error', {}).get('Code')

            if error_code == '404':
                # Bucket doesn't exist, create it
                try:
                    self.s3_client.create_bucket(
                        Bucket=config.S3_BUCKET,
                        CreateBucketConfiguration={
                            'LocationConstraint': config.AWS_REGION
                        }
                    )
                    logger.info(f"Created S3 bucket '{config.S3_BUCKET}'")
                except ClientError as ce:
                    logger.error(f"Error creating S3 bucket: {ce}")
                    self.use_s3 = False
                    self.stats['storage_errors'] += 1
            else:
                logger.error(f"Error checking S3 bucket: {e}")
                self.use_s3 = False
                self.stats['storage_errors'] += 1

    def _ensure_directories(self) -> None:
        """Ensure storage directories exist"""
        # Create main storage directory
        os.makedirs(config.STORAGE_PATH, exist_ok=True)

        # Create HTML storage directory
        os.makedirs(config.HTML_STORAGE_PATH, exist_ok=True)

        # Create log directory
        os.makedirs(config.LOG_PATH, exist_ok=True)

        logger.info("Storage directories created")

    def store_page(self, page: Page, flush: bool = False) -> bool:
        """
        Store a crawled page

        Args:
            page: Page object to store
            flush: Whether to flush page buffer immediately

        Returns:
            True if successful, False otherwise
        """
        try:
            # Store page content based on configuration
            if self.use_s3:
                content_stored = self._store_content_s3(page)
            else:
                content_stored = self._store_content_disk(page)

            if not content_stored:
                logger.warning(f"Failed to store content for {page.url}")
                self.stats['storage_errors'] += 1
                return False

            # Remove HTML content from page object for MongoDB storage
            page_dict = page.dict(exclude={'content'})

            # Convert datetime fields to proper format
            if page.crawled_at:
                page_dict['crawled_at'] = page.crawled_at

            # Add to buffer
            self.page_buffer.append(
                UpdateOne(
                    {'url': page.url},
                    {'$set': page_dict},
                    upsert=True
                )
            )

            # Update statistics
            self.stats['pages_stored'] += 1

            # Check if buffer should be flushed
            if flush or len(self.page_buffer) >= self.max_buffer_size:
                return self.flush_page_buffer()

            return True
        except Exception as e:
            logger.error(f"Error storing page {page.url}: {e}")
            self.stats['storage_errors'] += 1
            return False

    def _store_content_disk(self, page: Page) -> bool:
        """
        Store page content on disk

        Args:
            page: Page to store

        Returns:
            True if successful, False otherwise
        """
        try:
            # Check disk space
            if not self._check_disk_space():
                logger.warning("Disk space limit exceeded")
                return False

            # Create directory for domain if it doesn't exist
            domain = self._extract_domain(page.url)
            domain_dir = os.path.join(config.HTML_STORAGE_PATH, domain)
            os.makedirs(domain_dir, exist_ok=True)

            # Create filename
            filename = self._url_to_filename(page.url)

            # Full path for the file
            if self.compress_html:
                filepath = os.path.join(domain_dir, f"{filename}.gz")

                # Compress and write HTML to file
                with gzip.open(filepath, 'wt', encoding='utf-8') as f:
                    f.write(page.content)
            else:
                filepath = os.path.join(domain_dir, f"{filename}.html")

                # Write HTML to file
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(page.content)

            # Update disk space used
            file_size = os.path.getsize(filepath)
            self.stats['disk_space_used'] += file_size

            logger.debug(f"Stored HTML content for {page.url} at {filepath}")
            return True
        except Exception as e:
            logger.error(f"Error storing content on disk for {page.url}: {e}")
            self.stats['storage_errors'] += 1
            return False

    def _store_content_s3(self, page: Page) -> bool:
        """
        Store page content in S3

        Args:
            page: Page to store

        Returns:
            True if successful, False otherwise
        """
        if not self.s3_client:
            logger.warning("S3 client not initialized, falling back to disk storage")
            return self._store_content_disk(page)

        try:
            # Create key for S3 object
            domain = self._extract_domain(page.url)
            filename = self._url_to_filename(page.url)

            # S3 key
            s3_key = f"{domain}/{filename}"
            if self.compress_html:
                s3_key += ".gz"

                # Compress content
                content_bytes = gzip.compress(page.content.encode('utf-8'))
                content_type = 'application/gzip'
            else:
                s3_key += ".html"
                content_bytes = page.content.encode('utf-8')
                content_type = 'text/html'

            # Upload to S3
            self.s3_client.put_object(
                Bucket=config.S3_BUCKET,
                Key=s3_key,
                Body=content_bytes,
                ContentType=content_type,
                Metadata={
                    'url': page.url,
                    'crawled_at': page.crawled_at.isoformat() if page.crawled_at else '',
                    'content_hash': page.content_hash or ''
                }
            )

            # Update statistics
            self.stats['s3_objects_stored'] += 1

            logger.debug(f"Stored HTML content for {page.url} in S3 at {s3_key}")
            return True
        except Exception as e:
            logger.error(f"Error storing content in S3 for {page.url}: {e}")
            self.stats['storage_errors'] += 1

            # Fall back to disk storage
            logger.info(f"Falling back to disk storage for {page.url}")
            return self._store_content_disk(page)

    def store_url(self, url_obj: URL, flush: bool = False) -> bool:
        """
        Store URL information

        Args:
            url_obj: URL object to store
            flush: Whether to flush URL buffer immediately

        Returns:
            True if successful, False otherwise
        """
        try:
            # Convert URL object to dict
            url_dict = url_obj.dict()

            # Add to buffer
            self.url_buffer.append(
                UpdateOne(
                    {'url': url_obj.url},
                    {'$set': url_dict},
                    upsert=True
                )
            )

            # Update statistics
            self.stats['urls_stored'] += 1

            # Check if buffer should be flushed
            if flush or len(self.url_buffer) >= self.max_buffer_size:
                return self.flush_url_buffer()

            return True
        except Exception as e:
            logger.error(f"Error storing URL {url_obj.url}: {e}")
            self.stats['storage_errors'] += 1
            return False

    def flush_page_buffer(self) -> bool:
        """
        Flush page buffer to MongoDB

        Returns:
            True if successful, False otherwise
        """
        if not self.page_buffer:
            return True

        try:
            # Execute bulk operation
            result = self.pages_collection.bulk_write(self.page_buffer, ordered=False)

            # Clear buffer
            buffer_size = len(self.page_buffer)
            self.page_buffer = []

            logger.debug(f"Flushed {buffer_size} pages to MongoDB")
            return True
        except BulkWriteError as e:
            logger.error(f"Error in bulk write for pages: {e.details}")
            self.stats['storage_errors'] += 1

            # Clear buffer
            self.page_buffer = []
            return False
        except Exception as e:
            logger.error(f"Error flushing page buffer: {e}")
            self.stats['storage_errors'] += 1

            # Clear buffer
            self.page_buffer = []
            return False

    def flush_url_buffer(self) -> bool:
        """
        Flush URL buffer to MongoDB

        Returns:
            True if successful, False otherwise
        """
        if not self.url_buffer:
            return True

        try:
            # Execute bulk operation
            result = self.urls_collection.bulk_write(self.url_buffer, ordered=False)

            # Clear buffer
            buffer_size = len(self.url_buffer)
            self.url_buffer = []

            logger.debug(f"Flushed {buffer_size} URLs to MongoDB")
            return True
        except BulkWriteError as e:
            logger.error(f"Error in bulk write for URLs: {e.details}")
            self.stats['storage_errors'] += 1

            # Clear buffer
            self.url_buffer = []
            return False
        except Exception as e:
            logger.error(f"Error flushing URL buffer: {e}")
            self.stats['storage_errors'] += 1

            # Clear buffer
            self.url_buffer = []
            return False

    def get_page(self, url: str) -> Optional[Page]:
        """
        Retrieve a page by URL

        Args:
            url: URL of the page to retrieve

        Returns:
            Page object if found, None otherwise
        """
        try:
            # Get page metadata from MongoDB
            page_doc = self.pages_collection.find_one({'url': url})

            if not page_doc:
                return None

            # Create Page object from document
            page = Page(**page_doc)

            # Load content based on configuration
            if self.use_s3:
                content = self._load_content_s3(url)
            else:
                content = self._load_content_disk(url)

            if content:
                page.content = content

            # Update statistics
            self.stats['pages_retrieved'] += 1

            return page
        except Exception as e:
            logger.error(f"Error retrieving page {url}: {e}")
            self.stats['storage_errors'] += 1
            return None

    def _load_content_disk(self, url: str) -> Optional[str]:
        """
        Load page content from disk

        Args:
            url: URL of the page

        Returns:
            Page content if found, None otherwise
        """
        try:
            # Get domain and filename
            domain = self._extract_domain(url)
            filename = self._url_to_filename(url)

            # Check for compressed file first
            compressed_path = os.path.join(config.HTML_STORAGE_PATH, domain, f"{filename}.gz")
            uncompressed_path = os.path.join(config.HTML_STORAGE_PATH, domain, f"{filename}.html")

            if os.path.exists(compressed_path):
                # Load compressed content
                with gzip.open(compressed_path, 'rt', encoding='utf-8') as f:
                    return f.read()
            elif os.path.exists(uncompressed_path):
                # Load uncompressed content
                with open(uncompressed_path, 'r', encoding='utf-8') as f:
                    return f.read()
            else:
                logger.warning(f"Content file not found for {url}")
                return None
        except Exception as e:
            logger.error(f"Error loading content from disk for {url}: {e}")
            self.stats['storage_errors'] += 1
            return None

    def _load_content_s3(self, url: str) -> Optional[str]:
        """
        Load page content from S3

        Args:
            url: URL of the page

        Returns:
            Page content if found, None otherwise
        """
        if not self.s3_client:
            logger.warning("S3 client not initialized, falling back to disk loading")
            return self._load_content_disk(url)

        try:
            # Get domain and filename
            domain = self._extract_domain(url)
            filename = self._url_to_filename(url)

            # Try both compressed and uncompressed keys
            s3_key_compressed = f"{domain}/{filename}.gz"
            s3_key_uncompressed = f"{domain}/{filename}.html"

            try:
                # Try compressed file first
                response = self.s3_client.get_object(
                    Bucket=config.S3_BUCKET,
                    Key=s3_key_compressed
                )

                # Decompress content
                content_bytes = response['Body'].read()
                return gzip.decompress(content_bytes).decode('utf-8')
            except ClientError as e:
                if e.response['Error']['Code'] == 'NoSuchKey':
                    # Try uncompressed file
                    try:
                        response = self.s3_client.get_object(
                            Bucket=config.S3_BUCKET,
                            Key=s3_key_uncompressed
                        )
                        content_bytes = response['Body'].read()
                        return content_bytes.decode('utf-8')
                    except ClientError as e2:
                        if e2.response['Error']['Code'] == 'NoSuchKey':
                            logger.warning(f"Content not found in S3 for {url}")

                            # Try loading from disk as fallback
                            return self._load_content_disk(url)
                        else:
                            raise e2
                else:
                    raise e
        except Exception as e:
            logger.error(f"Error loading content from S3 for {url}: {e}")
            self.stats['storage_errors'] += 1

            # Try loading from disk as fallback
            return self._load_content_disk(url)

    def get_url(self, url: str) -> Optional[URL]:
        """
        Retrieve URL information by URL

        Args:
            url: URL to retrieve

        Returns:
            URL object if found, None otherwise
        """
        try:
            # Get URL information from MongoDB
            url_doc = self.urls_collection.find_one({'url': url})

            if not url_doc:
                return None

            # Create URL object from document
            url_obj = URL(**url_doc)

            # Update statistics
            self.stats['urls_retrieved'] += 1

            return url_obj
        except Exception as e:
            logger.error(f"Error retrieving URL {url}: {e}")
            self.stats['storage_errors'] += 1
            return None

    def get_urls_by_status(self, status: str, limit: int = 100) -> List[URL]:
        """
        Retrieve URLs by status

        Args:
            status: Status of URLs to retrieve
            limit: Maximum number of URLs to retrieve

        Returns:
            List of URL objects
        """
        try:
            # Get URLs from MongoDB
            url_docs = list(self.urls_collection.find({'status': status}).limit(limit))

            # Create URL objects from documents
            url_objs = [URL(**doc) for doc in url_docs]

            # Update statistics
            self.stats['urls_retrieved'] += len(url_objs)

            return url_objs
        except Exception as e:
            logger.error(f"Error retrieving URLs by status {status}: {e}")
            self.stats['storage_errors'] += 1
            return []

    def get_urls_by_domain(self, domain: str, limit: int = 100) -> List[URL]:
        """
        Retrieve URLs by domain

        Args:
            domain: Domain of URLs to retrieve
            limit: Maximum number of URLs to retrieve

        Returns:
            List of URL objects
        """
        try:
            # Get URLs from MongoDB
            url_docs = list(self.urls_collection.find({'domain': domain}).limit(limit))

            # Create URL objects from documents
            url_objs = [URL(**doc) for doc in url_docs]

            # Update statistics
            self.stats['urls_retrieved'] += len(url_objs)

            return url_objs
        except Exception as e:
            logger.error(f"Error retrieving URLs by domain {domain}: {e}")
            self.stats['storage_errors'] += 1
            return []

    def store_stats(self, stats: Dict[str, Any]) -> bool:
        """
        Store crawler statistics

        Args:
            stats: Statistics to store

        Returns:
            True if successful, False otherwise
        """
        try:
            # Create statistics document
            stats_doc = stats.copy()
            stats_doc['timestamp'] = datetime.datetime.now()

            # Convert sets to lists for MongoDB
            for key, value in stats_doc.items():
                if isinstance(value, set):
                    stats_doc[key] = list(value)

            # Store in MongoDB
            self.stats_collection.insert_one(stats_doc)

            return True
        except Exception as e:
            logger.error(f"Error storing statistics: {e}")
            self.stats['storage_errors'] += 1
            return False

    def _check_disk_space(self) -> bool:
        """
        Check if disk space limit is exceeded

        Returns:
            True if space is available, False otherwise
        """
        # Convert max disk usage to bytes
        max_bytes = self.max_disk_usage_gb * 1024 * 1024 * 1024

        # Check if limit is exceeded
        return self.stats['disk_space_used'] < max_bytes

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        parsed = urlparse(url)
        return parsed.netloc.replace(':', '_')

    def _url_to_filename(self, url: str) -> str:
        """Convert URL to filename"""
        # Hash the URL to create a safe filename
        return hashlib.md5(url.encode('utf-8')).hexdigest()

    def clean_old_pages(self, days: int = 90) -> int:
        """
        Remove pages older than a specified number of days

        Args:
            days: Number of days after which pages are considered old

        Returns:
            Number of pages removed
        """
        try:
            # Calculate cutoff date
            cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days)

            # Find old pages
            old_pages = list(self.pages_collection.find({
                'crawled_at': {'$lt': cutoff_date}
            }, {'url': 1}))

            if not old_pages:
                logger.info(f"No pages older than {days} days found")
                return 0

            # Remove from database
            delete_result = self.pages_collection.delete_many({
                'crawled_at': {'$lt': cutoff_date}
            })

            # Remove content files
            count = 0
            for page in old_pages:
                url = page['url']
                domain = self._extract_domain(url)
                filename = self._url_to_filename(url)

                # Check disk
                compressed_path = os.path.join(config.HTML_STORAGE_PATH, domain, f"{filename}.gz")
                uncompressed_path = os.path.join(config.HTML_STORAGE_PATH, domain, f"{filename}.html")

                if os.path.exists(compressed_path):
                    os.remove(compressed_path)
                    count += 1

                if os.path.exists(uncompressed_path):
                    os.remove(uncompressed_path)
                    count += 1

                # Check S3
                if self.s3_client:
                    s3_key_compressed = f"{domain}/{filename}.gz"
                    s3_key_uncompressed = f"{domain}/{filename}.html"

                    try:
                        self.s3_client.delete_object(
                            Bucket=config.S3_BUCKET,
                            Key=s3_key_compressed
                        )
                        count += 1
                    except Exception:
                        pass

                    try:
                        self.s3_client.delete_object(
                            Bucket=config.S3_BUCKET,
                            Key=s3_key_uncompressed
                        )
                        count += 1
                    except Exception:
                        pass

            logger.info(f"Removed {delete_result.deleted_count} old pages and {count} content files")
            return delete_result.deleted_count
        except Exception as e:
            logger.error(f"Error cleaning old pages: {e}")
            self.stats['storage_errors'] += 1
            return 0

    def clean_failed_urls(self, retries: int = 3) -> int:
        """
        Remove URLs that have failed repeatedly

        Args:
            retries: Number of retries after which a URL is considered permanently failed

        Returns:
            Number of URLs removed
        """
        try:
            # Delete failed URLs with too many retries
            delete_result = self.urls_collection.delete_many({
                'status': 'FAILED',
                'retries': {'$gte': retries}
            })

            logger.info(f"Removed {delete_result.deleted_count} permanently failed URLs")
            return delete_result.deleted_count
        except Exception as e:
            logger.error(f"Error cleaning failed URLs: {e}")
            self.stats['storage_errors'] += 1
            return 0

    def calculate_storage_stats(self) -> Dict[str, Any]:
        """
        Calculate storage statistics

        Returns:
            Dictionary of storage statistics
        """
        stats = {
            'timestamp': datetime.datetime.now(),
            'pages_count': 0,
            'urls_count': 0,
            'disk_space_used_mb': 0,
            's3_objects_count': 0,
            'mongodb_size_mb': 0,
        }

        try:
            # Count pages and URLs
            stats['pages_count'] = self.pages_collection.count_documents({})
            stats['urls_count'] = self.urls_collection.count_documents({})

            # Calculate disk space used
            total_size = 0
            for root, _, files in os.walk(config.HTML_STORAGE_PATH):
                total_size += sum(os.path.getsize(os.path.join(root, name)) for name in files)
            stats['disk_space_used_mb'] = total_size / (1024 * 1024)

            # Calculate MongoDB size
            db_stats = self.db.command('dbStats')
            stats['mongodb_size_mb'] = db_stats['storageSize'] / (1024 * 1024)

            # Count S3 objects if enabled
            if self.s3_client:
                try:
                    s3_objects = 0
                    paginator = self.s3_client.get_paginator('list_objects_v2')
                    for page in paginator.paginate(Bucket=config.S3_BUCKET):
                        if 'Contents' in page:
                            s3_objects += len(page['Contents'])
                    stats['s3_objects_count'] = s3_objects
                except Exception as e:
                    logger.error(f"Error counting S3 objects: {e}")

            # Update internal statistics
            self.stats['disk_space_used'] = total_size
            self.stats['mongodb_size'] = db_stats['storageSize']

            return stats
        except Exception as e:
            logger.error(f"Error calculating storage statistics: {e}")
            self.stats['storage_errors'] += 1
            return stats

    def close(self) -> None:
        """Close connections and perform cleanup"""
        # Flush any pending buffers
        self.flush_page_buffer()
        self.flush_url_buffer()

        # Close MongoDB connection
        if self.mongo_client:
            self.mongo_client.close()
            logger.info("MongoDB connection closed")

        # Log final statistics
        logger.info(f"Storage manager closed. Pages stored: {self.stats['pages_stored']}, URLs stored: {self.stats['urls_stored']}")
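A minimal usage sketch of the StorageManager above, assuming MongoDB is reachable at config.MONGODB_URI. The Page constructor fields shown are assumptions inferred from how the class reads them (url, content, crawled_at); the real schema lives in models.py and may differ.

    # Hypothetical StorageManager round trip (Page field names are assumed)
    import datetime
    from models import Page
    from storage import StorageManager

    storage = StorageManager(use_s3=False, compress_html=True)
    page = Page(
        url="https://example.com/",
        content="<html><body>Hello crawler</body></html>",
        crawled_at=datetime.datetime.now(),   # read by store_page when building the metadata document
    )
    storage.store_page(page, flush=True)      # HTML to gzip file on disk, metadata to MongoDB
    restored = storage.get_page("https://example.com/")
    print(storage.calculate_storage_stats())
    storage.close()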
test_crawler.py
ADDED
@@ -0,0 +1,219 @@
#!/usr/bin/env python3
"""
Test script for the web crawler - tests only the URL frontier and downloader
without requiring MongoDB
"""

import os
import sys
import time
import logging
import threading
from urllib.parse import urlparse
import redis

# Make sure we're in the right directory
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'test_crawler.log'))
    ]
)
logger = logging.getLogger("test_crawler")

# Import our modules
import config
from frontier import URLFrontier
from models import URL, Priority, URLStatus
from downloader import HTMLDownloader
from parser import HTMLParser
from robots import RobotsHandler
from dns_resolver import DNSResolver

# Import local configuration if available
try:
    import local_config
    # Override config settings with local settings
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    logger.info("Loaded local configuration")
except ImportError:
    logger.warning("No local_config.py found - using default config")

def test_redis():
    """Test Redis connection"""
    try:
        logger.info(f"Testing Redis connection to {config.REDIS_URI}")
        r = redis.from_url(config.REDIS_URI)
        r.ping()
        logger.info("Redis connection successful")
        return True
    except Exception as e:
        logger.error(f"Redis connection failed: {e}")
        return False

def test_robots_txt():
    """Test robots.txt handling"""
    try:
        logger.info("Testing robots.txt handling")
        robots_handler = RobotsHandler()
        test_urls = [
            "https://www.google.com/",
            "https://www.github.com/",
            "https://sagarnildas.com/",
        ]

        for url in test_urls:
            logger.info(f"Checking robots.txt for {url}")
            allowed, crawl_delay = robots_handler.can_fetch(url)
            logger.info(f"  Allowed: {allowed}, Crawl delay: {crawl_delay}")

        return True
    except Exception as e:
        logger.error(f"Error testing robots.txt: {e}")
        return False

def test_dns_resolver():
    """Test DNS resolver"""
    try:
        logger.info("Testing DNS resolver")
        dns_resolver = DNSResolver()
        test_domains = [
            "www.google.com",
            "www.github.com",
            "example.com",
        ]

        for domain in test_domains:
            logger.info(f"Resolving {domain}")
            ip = dns_resolver.resolve(f"https://{domain}/")
            logger.info(f"  IP: {ip}")

        return True
    except Exception as e:
        logger.error(f"Error testing DNS resolver: {e}")
        return False

def test_url_frontier():
    """Test URL frontier"""
    try:
        logger.info("Testing URL frontier")
        frontier = URLFrontier()

        # Clear frontier
        frontier.clear()

        # Add some URLs
        test_urls = [
            "https://www.google.com/",
            "https://www.github.com/",
            "https://sagarnildas.com/",
        ]

        for i, url in enumerate(test_urls):
            url_obj = URL(
                url=url,
                priority=Priority.MEDIUM,
                status=URLStatus.PENDING,
                depth=0
            )
            added = frontier.add_url(url_obj)
            logger.info(f"Added {url}: {added}")

        # Check size
        size = frontier.size()
        logger.info(f"Frontier size: {size}")

        # Get next URL
        url = frontier.get_next_url()
        if url:
            logger.info(f"Next URL: {url.url} (priority: {url.priority})")
        else:
            logger.info("No URL available")

        return True
    except Exception as e:
        logger.error(f"Error testing URL frontier: {e}")
        return False

def test_downloader():
    """Test HTML downloader"""
    try:
        logger.info("Testing HTML downloader")
        downloader = HTMLDownloader()

        test_urls = [
            URL(url="https://sagarnildas.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
            URL(url="https://www.google.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
        ]

        for url_obj in test_urls:
            logger.info(f"Downloading {url_obj.url}")
            page = downloader.download(url_obj)
            if page:
                logger.info(f"  Downloaded {page.content_length} bytes, status: {page.status_code}")
                # Test parsing
                parser = HTMLParser()
                urls, metadata = parser.parse(page)
                logger.info(f"  Extracted {len(urls)} URLs and {len(metadata)} metadata items")
            else:
                logger.info(f"  Download failed: {url_obj.error}")

        return True
    except Exception as e:
        logger.error(f"Error testing HTML downloader: {e}")
        return False

def run_tests():
    """Run all tests"""
    logger.info("Starting crawler component tests")

    tests = [
        ("Redis", test_redis),
        ("Robots.txt", test_robots_txt),
        ("DNS Resolver", test_dns_resolver),
        ("URL Frontier", test_url_frontier),
        ("HTML Downloader", test_downloader),
    ]

    results = []
    for name, test_func in tests:
        logger.info(f"\n=== Testing {name} ===")
        start_time = time.time()
        success = test_func()
        elapsed = time.time() - start_time

        result = {
            "name": name,
            "success": success,
            "time": elapsed
        }
        results.append(result)

        logger.info(f"=== {name} test {'succeeded' if success else 'failed'} in {elapsed:.2f}s ===\n")

    # Print summary
    logger.info("\n=== Test Summary ===")
    all_success = True
    for result in results:
        status = "SUCCESS" if result["success"] else "FAILED"
        logger.info(f"{result['name']}: {status} ({result['time']:.2f}s)")
        if not result["success"]:
            all_success = False

    if all_success:
        logger.info("All tests passed!")
    else:
        logger.warning("Some tests failed. Check logs for details.")

    return all_success

if __name__ == "__main__":
    run_tests()
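The script is meant to be run directly (python test_crawler.py) with Redis reachable at config.REDIS_URI; results go to stdout and test_crawler.log. Extending the suite takes a test_* function returning a bool plus an entry in the tests list inside run_tests(). A small hypothetical example, reusing only names that appear above:

    # Hypothetical extra check: the frontier reports size 0 right after clear()
    def test_frontier_clear():
        """Frontier should be empty immediately after clear()"""
        try:
            frontier = URLFrontier()
            frontier.clear()
            return frontier.size() == 0
        except Exception as e:
            logger.error(f"Error testing frontier clear: {e}")
            return False

    # ...and register it in run_tests():
    #     tests.append(("Frontier Clear", test_frontier_clear))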