"""
HTML Parser and URL Extractor component for web crawler
"""
import hashlib
import logging
import os
import re
from typing import Any, Dict, List, Optional, Set, Tuple
from urllib.parse import unquote, urljoin, urlparse

import tldextract
from bs4 import BeautifulSoup

import config
from models import URL, Page, Priority, normalize_url
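
# Note (assumption): config is the crawler's local settings module; this component
# only relies on its logging settings here, for example something like:
#     LOG_LEVEL = "INFO"
#     LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s: %(message)s"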

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class HTMLParser:
    """
    Parses HTML content and extracts URLs and other information
    """

    def __init__(self):
        """Initialize HTML parser"""
        # Compile URL filter regex patterns for efficiency
        self.url_filters = [re.compile(pattern) for pattern in config.URL_FILTERS]
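        # Assumed shape of config.URL_FILTERS (defined outside this module): an
        # iterable of regex pattern strings applied to normalized URLs via
        # _should_allow_url, e.g. something like
        #     URL_FILTERS = [r"\.(?:css|js|png|jpe?g|gif)(\?.*)?$", r"/(login|logout)\b"]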

    def parse(self, page: Page, base_url: Optional[str] = None) -> Tuple[List[str], Dict[str, Any]]:
        """
        Parse HTML content and extract URLs and metadata

        Args:
            page: Page object containing HTML content
            base_url: Base URL for resolving relative links (defaults to page URL)

        Returns:
            Tuple of (extracted URLs, metadata)
        """
        if not page or not page.content:
            return [], {}

        # Use page URL as base URL if not provided
        if not base_url:
            base_url = page.url

        # Parse HTML content
        soup = BeautifulSoup(page.content, 'html.parser')

        # Extract URLs
        urls = self._extract_urls(soup, base_url)

        # Extract metadata
        metadata = self._extract_metadata(soup)

        return urls, metadata
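
    # Usage sketch (illustrative only): a Page is assumed to expose .url and .content,
    # as the checks in parse() above rely on, and to have been fetched elsewhere.
    #     parser = HTMLParser()
    #     urls, metadata = parser.parse(page)
    #     urls, metadata = parser.parse(page, base_url="https://example.com/docs/")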

    def _extract_urls(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """
        Extract and normalize URLs from HTML content

        Args:
            soup: BeautifulSoup object
            base_url: Base URL for resolving relative links

        Returns:
            List of normalized URLs
        """
        urls = set()
        all_urls = set()       # Track all URLs before filtering
        filtered_urls = set()  # Track filtered URLs

        logger.debug(f"Extracting URLs from page: {base_url}")

        # Extract URLs from <a> tags
        for link in soup.find_all('a', href=True):
            href = link['href'].strip()
            if href and not href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
                # Resolve relative URLs
                try:
                    absolute_url = urljoin(base_url, href)
                    all_urls.add(absolute_url)

                    # Normalize URL
                    normalized_url = normalize_url(absolute_url)

                    # Apply URL filters
                    if self._should_allow_url(normalized_url):
                        urls.add(normalized_url)
                    else:
                        filtered_urls.add(normalized_url)
                except Exception as e:
                    logger.debug(f"Error processing URL {href}: {e}")

        # Extract URLs from other elements like