#!/usr/bin/env python3
"""
Test script for the web crawler - tests only the URL frontier and downloader
without requiring MongoDB
"""

import os
import sys
import time
import logging
import redis

# Make sure we're in the right directory
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'test_crawler.log'))
    ]
)
logger = logging.getLogger("test_crawler")

# Import our modules
import config
from frontier import URLFrontier
from models import URL, Priority, URLStatus
from downloader import HTMLDownloader
from parser import HTMLParser
from robots import RobotsHandler
from dns_resolver import DNSResolver

# Import local configuration if available
try:
    import local_config
    # Override config settings with local settings
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    logger.info("Loaded local configuration")
except ImportError:
    logger.warning("No local_config.py found - using default config")

def test_redis():
    """Test Redis connection"""
    try:
        logger.info(f"Testing Redis connection to {config.REDIS_URI}")
        r = redis.from_url(config.REDIS_URI)
        r.ping()
        logger.info("Redis connection successful")
        return True
    except Exception as e:
        logger.error(f"Redis connection failed: {e}")
        return False

def test_robots_txt():
    """Test robots.txt handling"""
    try:
        logger.info("Testing robots.txt handling")
        robots_handler = RobotsHandler()
        test_urls = [
            "https://www.google.com/",
            "https://www.github.com/",
            "https://sagarnildas.com/",
        ]
        
        for url in test_urls:
            logger.info(f"Checking robots.txt for {url}")
            allowed, crawl_delay = robots_handler.can_fetch(url)
            logger.info(f"  Allowed: {allowed}, Crawl delay: {crawl_delay}")
        
        return True
    except Exception as e:
        logger.error(f"Error testing robots.txt: {e}")
        return False

def test_dns_resolver():
    """Test DNS resolver"""
    try:
        logger.info("Testing DNS resolver")
        dns_resolver = DNSResolver()
        test_domains = [
            "www.google.com",
            "www.github.com",
            "example.com",
        ]
        
        for domain in test_domains:
            logger.info(f"Resolving {domain}")
            ip = dns_resolver.resolve(f"https://{domain}/")
            logger.info(f"  IP: {ip}")
        
        return True
    except Exception as e:
        logger.error(f"Error testing DNS resolver: {e}")
        return False

def test_url_frontier():
    """Test URL frontier"""
    try:
        logger.info("Testing URL frontier")
        frontier = URLFrontier()
        
        # Clear frontier
        frontier.clear()
        
        # Add some URLs
        test_urls = [
            "https://www.google.com/",
            "https://www.github.com/",
            "https://sagarnildas.com/",
        ]
        
        for url in test_urls:
            url_obj = URL(
                url=url,
                priority=Priority.MEDIUM,
                status=URLStatus.PENDING,
                depth=0
            )
            added = frontier.add_url(url_obj)
            logger.info(f"Added {url}: {added}")
        
        # Check size
        size = frontier.size()
        logger.info(f"Frontier size: {size}")
        
        # Get next URL
        url = frontier.get_next_url()
        if url:
            logger.info(f"Next URL: {url.url} (priority: {url.priority})")
        else:
            logger.info("No URL available")
        
        return True
    except Exception as e:
        logger.error(f"Error testing URL frontier: {e}")
        return False

def test_downloader():
    """Test HTML downloader"""
    try:
        logger.info("Testing HTML downloader")
        downloader = HTMLDownloader()
        
        test_urls = [
            URL(url="https://sagarnildas.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
            URL(url="https://www.google.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
        ]
        
        for url_obj in test_urls:
            logger.info(f"Downloading {url_obj.url}")
            page = downloader.download(url_obj)
            if page:
                logger.info(f"  Downloaded {page.content_length} bytes, status: {page.status_code}")
                # Test parsing
                parser = HTMLParser()
                urls, metadata = parser.parse(page)
                logger.info(f"  Extracted {len(urls)} URLs and {len(metadata)} metadata items")
            else:
                logger.info(f"  Download failed: {url_obj.error}")
        
        return True
    except Exception as e:
        logger.error(f"Error testing HTML downloader: {e}")
        return False

def run_tests():
    """Run all tests"""
    logger.info("Starting crawler component tests")
    
    tests = [
        ("Redis", test_redis),
        ("Robots.txt", test_robots_txt),
        ("DNS Resolver", test_dns_resolver),
        ("URL Frontier", test_url_frontier),
        ("HTML Downloader", test_downloader),
    ]
    
    results = []
    for name, test_func in tests:
        logger.info(f"\n=== Testing {name} ===")
        start_time = time.time()
        success = test_func()
        elapsed = time.time() - start_time
        
        result = {
            "name": name,
            "success": success,
            "time": elapsed
        }
        results.append(result)
        
        logger.info(f"=== {name} test {'succeeded' if success else 'failed'} in {elapsed:.2f}s ===\n")
    
    # Print summary
    logger.info("\n=== Test Summary ===")
    all_success = True
    for result in results:
        status = "SUCCESS" if result["success"] else "FAILED"
        logger.info(f"{result['name']}: {status} ({result['time']:.2f}s)")
        if not result["success"]:
            all_success = False
    
    if all_success:
        logger.info("All tests passed!")
    else:
        logger.warning("Some tests failed. Check logs for details.")
    
    return all_success

if __name__ == "__main__":
    run_tests()