#!/usr/bin/env python3
"""
Test script for the web crawler - tests only the URL frontier and downloader
without requiring MongoDB
"""
import os
import sys
import time
import logging

import redis

# Make sure we're in the right directory
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'test_crawler.log')),
    ],
)
logger = logging.getLogger("test_crawler")

# Import our modules
import config
from frontier import URLFrontier
from models import URL, Priority, URLStatus
from downloader import HTMLDownloader
from parser import HTMLParser
from robots import RobotsHandler
from dns_resolver import DNSResolver

# Import local configuration if available
try:
    import local_config

    # Override config settings with local settings
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    logger.info("Loaded local configuration")
except ImportError:
    logger.warning("No local_config.py found - using default config")
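
# Illustrative sketch of a local_config.py override (assumed values, not part
# of this repo; REDIS_URI is the one setting this script reads directly, and
# any other UPPERCASE name would be copied onto config the same way):
#
#     REDIS_URI = "redis://localhost:6379/0"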


def test_redis():
    """Test Redis connection"""
    try:
        logger.info(f"Testing Redis connection to {config.REDIS_URI}")
        r = redis.from_url(config.REDIS_URI)
        r.ping()
        logger.info("Redis connection successful")
        return True
    except Exception as e:
        logger.error(f"Redis connection failed: {e}")
        return False


def test_robots_txt():
    """Test robots.txt handling"""
    try:
        logger.info("Testing robots.txt handling")
        robots_handler = RobotsHandler()
        test_urls = [
            "https://www.google.com/",
            "https://www.github.com/",
            "https://sagarnildas.com/",
        ]
        for url in test_urls:
            logger.info(f"Checking robots.txt for {url}")
            allowed, crawl_delay = robots_handler.can_fetch(url)
            logger.info(f" Allowed: {allowed}, Crawl delay: {crawl_delay}")
        return True
    except Exception as e:
        logger.error(f"Error testing robots.txt: {e}")
        return False


def test_dns_resolver():
    """Test DNS resolver"""
    try:
        logger.info("Testing DNS resolver")
        dns_resolver = DNSResolver()
        test_domains = [
            "www.google.com",
            "www.github.com",
            "example.com",
        ]
        for domain in test_domains:
            logger.info(f"Resolving {domain}")
            ip = dns_resolver.resolve(f"https://{domain}/")
            logger.info(f" IP: {ip}")
        return True
    except Exception as e:
        logger.error(f"Error testing DNS resolver: {e}")
        return False


def test_url_frontier():
    """Test URL frontier"""
    try:
        logger.info("Testing URL frontier")
        frontier = URLFrontier()

        # Clear frontier
        frontier.clear()

        # Add some URLs
        test_urls = [
            "https://www.google.com/",
            "https://www.github.com/",
            "https://sagarnildas.com/",
        ]
        for url in test_urls:
            url_obj = URL(
                url=url,
                priority=Priority.MEDIUM,
                status=URLStatus.PENDING,
                depth=0,
            )
            added = frontier.add_url(url_obj)
            logger.info(f"Added {url}: {added}")

        # Check size
        size = frontier.size()
        logger.info(f"Frontier size: {size}")

        # Get next URL
        url = frontier.get_next_url()
        if url:
            logger.info(f"Next URL: {url.url} (priority: {url.priority})")
        else:
            logger.info("No URL available")
        return True
    except Exception as e:
        logger.error(f"Error testing URL frontier: {e}")
        return False


def test_downloader():
    """Test HTML downloader"""
    try:
        logger.info("Testing HTML downloader")
        downloader = HTMLDownloader()
        test_urls = [
            URL(url="https://sagarnildas.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
            URL(url="https://www.google.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
        ]
        for url_obj in test_urls:
            logger.info(f"Downloading {url_obj.url}")
            page = downloader.download(url_obj)
            if page:
                logger.info(f" Downloaded {page.content_length} bytes, status: {page.status_code}")

                # Test parsing
                parser = HTMLParser()
                urls, metadata = parser.parse(page)
                logger.info(f" Extracted {len(urls)} URLs and {len(metadata)} metadata items")
            else:
                logger.info(f" Download failed: {url_obj.error}")
        return True
    except Exception as e:
        logger.error(f"Error testing HTML downloader: {e}")
        return False


def run_tests():
    """Run all tests"""
    logger.info("Starting crawler component tests")
    tests = [
        ("Redis", test_redis),
        ("Robots.txt", test_robots_txt),
        ("DNS Resolver", test_dns_resolver),
        ("URL Frontier", test_url_frontier),
        ("HTML Downloader", test_downloader),
    ]
    results = []
    for name, test_func in tests:
        logger.info(f"\n=== Testing {name} ===")
        start_time = time.time()
        success = test_func()
        elapsed = time.time() - start_time
        result = {
            "name": name,
            "success": success,
            "time": elapsed,
        }
        results.append(result)
        logger.info(f"=== {name} test {'succeeded' if success else 'failed'} in {elapsed:.2f}s ===\n")

    # Print summary
    logger.info("\n=== Test Summary ===")
    all_success = True
    for result in results:
        status = "SUCCESS" if result["success"] else "FAILED"
        logger.info(f"{result['name']}: {status} ({result['time']:.2f}s)")
        if not result["success"]:
            all_success = False
    if all_success:
        logger.info("All tests passed!")
    else:
        logger.warning("Some tests failed. Check logs for details.")
    return all_success


if __name__ == "__main__":
    # Exit with a nonzero status if any component test failed
    success = run_tests()
    sys.exit(0 if success else 1)