"""
HTML Parser and URL Extractor component for web crawler
"""

import json
import logging
import re
from typing import Dict, List, Tuple, Optional, Any
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import tldextract

from models import URL, Page, Priority, normalize_url
import config

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT
)
logger = logging.getLogger(__name__)


class HTMLParser:
    """
    Parses HTML content and extracts URLs and other information
    """
    
    def __init__(self):
        """Initialize HTML parser"""
        # Compile URL filter regex patterns for efficiency
        self.url_filters = [re.compile(pattern) for pattern in config.URL_FILTERS]
    
    def parse(self, page: Page, base_url: Optional[str] = None) -> Tuple[List[str], Dict[str, Any]]:
        """
        Parse HTML content and extract URLs and metadata
        
        Args:
            page: Page object containing HTML content
            base_url: Base URL for resolving relative links (defaults to page URL)
            
        Returns:
            Tuple of (extracted URLs, metadata)
        """
        if not page or not page.content:
            return [], {}
        
        # Use page URL as base URL if not provided
        if not base_url:
            base_url = page.url
            
        # Parse HTML content
        soup = BeautifulSoup(page.content, 'html.parser')
        
        # Extract URLs
        urls = self._extract_urls(soup, base_url)
        
        # Extract metadata
        metadata = self._extract_metadata(soup)
        
        return urls, metadata
    
    def _extract_urls(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """
        Extract and normalize URLs from HTML content
        
        Args:
            soup: BeautifulSoup object
            base_url: Base URL for resolving relative links
            
        Returns:
            List of normalized URLs
        """
        urls = set()
        all_urls = set()  # Track all URLs before filtering
        filtered_urls = set()  # Track filtered URLs
        
        logger.debug(f"Extracting URLs from page: {base_url}")
        
        # Extract URLs from <a> tags
        for link in soup.find_all('a', href=True):
            href = link['href'].strip()
            if href and not href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
                # Resolve relative URLs
                try:
                    absolute_url = urljoin(base_url, href)
                    all_urls.add(absolute_url)
                    # Normalize URL
                    normalized_url = normalize_url(absolute_url)
                    # Apply URL filters
                    if self._should_allow_url(normalized_url):
                        urls.add(normalized_url)
                    else:
                        filtered_urls.add(normalized_url)
                except Exception as e:
                    logger.debug(f"Error processing URL {href}: {e}")
        
        # Extract URLs from other elements like <iframe>, <frame>, <img>, etc.
        for tag_name, attr in [('frame', 'src'), ('iframe', 'src'), ('img', 'src'),
                               ('link', 'href'), ('script', 'src'), ('area', 'href')]:
            for tag in soup.find_all(tag_name, attrs={attr: True}):
                url = tag[attr].strip()
                if url and not url.startswith(('#', 'javascript:', 'data:', 'mailto:', 'tel:')):
                    try:
                        absolute_url = urljoin(base_url, url)
                        all_urls.add(absolute_url)
                        normalized_url = normalize_url(absolute_url)
                        if self._should_allow_url(normalized_url):
                            urls.add(normalized_url)
                        else:
                            filtered_urls.add(normalized_url)
                    except Exception as e:
                        logger.debug(f"Error processing URL {url}: {e}")
        
        # Log statistics
        logger.debug(f"Found {len(all_urls)} total URLs")
        logger.debug(f"Filtered {len(filtered_urls)} URLs")
        logger.debug(f"Accepted {len(urls)} URLs")
        
        # Log some example filtered URLs for debugging
        if filtered_urls:
            sample_filtered = list(filtered_urls)[:5]
            logger.debug(f"Sample filtered URLs: {sample_filtered}")
        
        # Return list of unique URLs
        return list(urls)
    
    def _should_allow_url(self, url: str) -> bool:
        """
        Check if URL should be allowed based on filters
        
        Args:
            url: URL to check
            
        Returns:
            True if URL should be allowed, False otherwise
        """
        try:
            parsed = urlparse(url)
            
            # Check scheme
            if parsed.scheme not in config.ALLOWED_SCHEMES:
                logger.debug(f"URL filtered - invalid scheme: {url}")
                return False
            
            # Check domain restrictions
            domain = self._extract_domain(url)
            
            # Check allowed domains if set
            if config.ALLOWED_DOMAINS and domain not in config.ALLOWED_DOMAINS:
                logger.debug(f"URL filtered - domain not allowed: {url} (domain: {domain}, allowed: {config.ALLOWED_DOMAINS})")
                return False
            
            # Check excluded domains
            if domain in config.EXCLUDED_DOMAINS:
                logger.debug(f"URL filtered - domain excluded: {url}")
                return False
            
            # Check URL filters
            for pattern in self.url_filters:
                if pattern.match(url):
                    logger.debug(f"URL filtered - pattern match: {url}")
                    return False
            
            return True
            
        except Exception as e:
            logger.debug(f"Error checking URL {url}: {e}")
            return False
    
    def _extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """
        Extract metadata from HTML content
        
        Args:
            soup: BeautifulSoup object
            
        Returns:
            Dictionary of metadata
        """
        metadata = {}
        
        # Extract title
        title_tag = soup.find('title')
        if title_tag and title_tag.string:
            metadata['title'] = title_tag.string.strip()
        
        # Extract meta description
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag and description_tag.get('content'):
            metadata['description'] = description_tag['content'].strip()
        
        # Extract meta keywords
        keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
        if keywords_tag and keywords_tag.get('content'):
            metadata['keywords'] = [k.strip() for k in keywords_tag['content'].split(',')]
        
        # Extract canonical URL
        canonical_tag = soup.find('link', attrs={'rel': 'canonical'})
        if canonical_tag and canonical_tag.get('href'):
            metadata['canonical_url'] = canonical_tag['href'].strip()
        
        # Extract robots meta
        robots_tag = soup.find('meta', attrs={'name': 'robots'})
        if robots_tag and robots_tag.get('content'):
            metadata['robots'] = robots_tag['content'].strip()
        
        # Extract Open Graph metadata
        og_metadata = {}
        for meta_tag in soup.find_all('meta', attrs={'property': re.compile('^og:')}):
            if meta_tag.get('content'):
                property_name = meta_tag['property'][3:]  # Remove 'og:' prefix
                og_metadata[property_name] = meta_tag['content'].strip()
        
        if og_metadata:
            metadata['open_graph'] = og_metadata
        
        # Extract Twitter Card metadata
        twitter_metadata = {}
        for meta_tag in soup.find_all('meta', attrs={'name': re.compile('^twitter:')}):
            if meta_tag.get('content'):
                property_name = meta_tag['name'][8:]  # Remove 'twitter:' prefix
                twitter_metadata[property_name] = meta_tag['content'].strip()
        
        if twitter_metadata:
            metadata['twitter_card'] = twitter_metadata
        
        # Extract schema.org structured data (JSON-LD)
        schema_metadata = []
        for script in soup.find_all('script', attrs={'type': 'application/ld+json'}):
            if script.string:
                try:
                    schema_data = json.loads(script.string)
                    schema_metadata.append(schema_data)
                except Exception as e:
                    logger.debug(f"Error parsing JSON-LD: {e}")
        
        if schema_metadata:
            metadata['structured_data'] = schema_metadata
        
        # Extract text content statistics
        text_content = soup.get_text(separator=' ', strip=True)
        if text_content:
            word_count = len(text_content.split())
            metadata['word_count'] = word_count
            metadata['text_length'] = len(text_content)
        
        return metadata
    
    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        parsed = tldextract.extract(url)
        return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
    
    def calculate_priority(self, url: str, metadata: Dict[str, Any]) -> Priority:
        """
        Calculate priority for a URL based on various factors
        
        Args:
            url: URL to calculate priority for
            metadata: Metadata extracted from the page
            
        Returns:
            Priority enum value
        """
        # Default priority
        priority = Priority.MEDIUM
        
        try:
            # Extract path depth
            parsed = urlparse(url)
            path = parsed.path
            depth = len([p for p in path.split('/') if p])
            
            # Prioritize URLs with shorter paths
            if depth <= 1:
                priority = Priority.HIGH
            elif depth <= 3:
                priority = Priority.MEDIUM
            else:
                priority = Priority.LOW
            
            # Prioritize URLs whose path contains content-related keywords
            if re.search(r'\b(article|blog|news|post)s?\b', path, re.IGNORECASE):
                priority = Priority.HIGH
            
            # Deprioritize URLs with pagination query parameters; anchoring to
            # ?/& avoids false matches such as "step=2" triggering on "p=2"
            if re.search(r'[?&](page|p|pg)=\d+', url, re.IGNORECASE):
                priority = Priority.LOW
            
            # Check metadata. The min() calls below only raise priority if
            # Priority is an ordered enum where HIGH compares less than MEDIUM.
            if metadata:
                # Prioritize based on title
                title = metadata.get('title', '')
                if title and len(title) > 10:
                    priority = min(priority, Priority.MEDIUM)  # Raise priority if it's lower
                
                # Prioritize based on description
                description = metadata.get('description', '')
                if description and len(description) > 50:
                    priority = min(priority, Priority.MEDIUM)  # Raise priority if it's lower
                
                # Prioritize based on word count
                word_count = metadata.get('word_count', 0)
                if word_count > 1000:
                    priority = min(priority, Priority.HIGH)  # High priority for content-rich pages
                elif word_count > 500:
                    priority = min(priority, Priority.MEDIUM)
            
            return priority
            
        except Exception as e:
            logger.debug(f"Error calculating priority for URL {url}: {e}")
            return Priority.MEDIUM
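

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): feeds a small HTML snippet through
# HTMLParser.parse() and prints each extracted URL with its calculated
# priority. It assumes Page accepts `url` and `content` keyword arguments;
# adjust to the actual Page model in models.py if its constructor differs.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_html = """
    <html>
      <head>
        <title>Example article</title>
        <meta name="description" content="A short example page used to exercise the parser.">
      </head>
      <body>
        <a href="/blog/first-post">First post</a>
        <a href="https://example.com/about">About</a>
        <a href="https://example.com/list?p=2">Next page</a>
      </body>
    </html>
    """
    parser = HTMLParser()
    page = Page(url="https://example.com/", content=sample_html)  # assumed constructor
    urls, metadata = parser.parse(page)
    for extracted_url in urls:
        print(extracted_url, parser.calculate_priority(extracted_url, metadata))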