""" HTML Parser and URL Extractor component for web crawler """ import logging import re from typing import Dict, List, Set, Tuple, Optional, Any from urllib.parse import urlparse, urljoin, unquote from bs4 import BeautifulSoup import tldextract import hashlib import os from models import URL, Page, Priority, normalize_url import config # Configure logging logging.basicConfig( level=getattr(logging, config.LOG_LEVEL), format=config.LOG_FORMAT ) logger = logging.getLogger(__name__) class HTMLParser: """ Parses HTML content and extracts URLs and other information """ def __init__(self): """Initialize HTML parser""" # Compile URL filter regex patterns for efficiency self.url_filters = [re.compile(pattern) for pattern in config.URL_FILTERS] def parse(self, page: Page, base_url: Optional[str] = None) -> Tuple[List[str], Dict[str, Any]]: """ Parse HTML content and extract URLs and metadata Args: page: Page object containing HTML content base_url: Base URL for resolving relative links (defaults to page URL) Returns: Tuple of (extracted URLs, metadata) """ if not page or not page.content: return [], {} # Use page URL as base URL if not provided if not base_url: base_url = page.url # Parse HTML content soup = BeautifulSoup(page.content, 'html.parser') # Extract URLs urls = self._extract_urls(soup, base_url) # Extract metadata metadata = self._extract_metadata(soup) return urls, metadata def _extract_urls(self, soup: BeautifulSoup, base_url: str) -> List[str]: """ Extract and normalize URLs from HTML content Args: soup: BeautifulSoup object base_url: Base URL for resolving relative links Returns: List of normalized URLs """ urls = set() all_urls = set() # Track all URLs before filtering filtered_urls = set() # Track filtered URLs logger.debug(f"Extracting URLs from page: {base_url}") # Extract URLs from tags for link in soup.find_all('a', href=True): href = link['href'].strip() if href and not href.startswith(('#', 'javascript:', 'mailto:', 'tel:')): # Resolve relative URLs try: absolute_url = urljoin(base_url, href) all_urls.add(absolute_url) # Normalize URL normalized_url = normalize_url(absolute_url) # Apply URL filters if self._should_allow_url(normalized_url): urls.add(normalized_url) else: filtered_urls.add(normalized_url) except Exception as e: logger.debug(f"Error processing URL {href}: {e}") # Extract URLs from other elements like