Update app2.py

app2.py (CHANGED)
@@ -88,13 +88,13 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
     directory.mkdir(parents=True, exist_ok=True)

 class EnhancedURLProcessor:
-    """Advanced URL processing with enhanced content extraction"""
+    """Advanced URL processing with enhanced content extraction and recursive link following."""
+
     def __init__(self):
         self.session = requests.Session()
         self.timeout = 15  # Extended timeout for larger content
         self.max_retries = 3
         self.user_agent = UserAgent()
-
         # Enhanced headers for better site compatibility
         self.session.headers.update({
             'User-Agent': self.user_agent.random,

@@ -110,7 +110,7 @@ class EnhancedURLProcessor:
             'DNT': '1'
         })

-    def validate_url(self, url: str) -> Dict:
+    def validate_url(self, url: str) -> Dict[str, Any]:
         """Enhanced URL validation with detailed feedback"""
         try:
             if not validators.url(url):

@@ -123,36 +123,47 @@ class EnhancedURLProcessor:
             head_response = self.session.head(url, timeout=5)
             head_response.raise_for_status()
             final_url = head_response.url  # Capture potential redirects
+            content_type = head_response.headers.get('Content-Type', 'unknown')
+            server = head_response.headers.get('Server', 'unknown')
+            size = head_response.headers.get('Content-Length', 'unknown')
         except requests.exceptions.RequestException:
             # If HEAD fails, try GET as some servers don't support HEAD
-
-
-
+            try:
+                response = self.session.get(url, timeout=self.timeout)
+                response.raise_for_status()
+                final_url = response.url  # Capture potential redirects
+                content_type = response.headers.get('Content-Type', 'unknown')
+                server = response.headers.get('Server', 'unknown')
+                size = response.headers.get('Content-Length', 'unknown')  # May not be accurate for full content
+            except requests.exceptions.RequestException as get_e:
+                return {'is_valid': False, 'message': f'URL not accessible after HEAD/GET attempts: {str(get_e)}', 'details': str(get_e)}
+            except Exception as get_e:
+                return {'is_valid': False, 'message': f'Unexpected error during GET validation: {str(get_e)}', 'details': str(get_e)}
+

             return {
                 'is_valid': True,
                 'message': 'URL is valid and accessible',
                 'details': {
                     'final_url': final_url,
-                    'content_type':
-                    'server':
-                    'size':
+                    'content_type': content_type,
+                    'server': server,
+                    'size': size
                 }
             }
         except Exception as e:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}

-    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict]:
+    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
         """Enhanced content fetcher with retry mechanism and complete character extraction"""
         try:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
-
             # Update User-Agent randomly for each request
             self.session.headers.update({'User-Agent': self.user_agent.random})
-
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
             final_url = response.url  # Capture potential redirects
+            content_type = response.headers.get('Content-Type', '')

             # Detect encoding
             if response.encoding is None or response.encoding == 'ISO-8859-1':  # chardet often better than default response.encoding for text

@@ -177,14 +188,13 @@ class EnhancedURLProcessor:
                     encoding = 'latin-1 (fallback)'
                     logger.warning(f"Decoding with {encoding} fallback for {url}")

-
             # Extract metadata
             metadata = {
                 'original_url': url,
                 'final_url': final_url,
                 'timestamp': datetime.now().isoformat(),
                 'detected_encoding': encoding,
-                'content_type':
+                'content_type': content_type,
                 'content_length': len(response.content),
                 'headers': dict(response.headers),
                 'status_code': response.status_code

@@ -195,7 +205,7 @@ class EnhancedURLProcessor:

             return {
                 'source': 'url',
-                'url': url,  # Keep original URL as identifier
+                'url': url,  # Keep original URL as identifier for this step
                 'raw_content': raw_content,
                 'metadata': metadata,
                 'extracted_data': processed_extraction['data'],

@@ -211,9 +221,9 @@ class EnhancedURLProcessor:
                 'source': 'url',
                 'url': url,
                 'raw_content': None,
-                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},  # Include basic metadata on failure
                 'extracted_data': None,
-                'processing_notes': f"Failed to fetch content: {str(e)}"
+                'processing_notes': [f"Failed to fetch content after {self.max_retries} attempts: {str(e)}"]  # Ensure notes is a list
             }
         except Exception as e:
             logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")

@@ -221,9 +231,9 @@ class EnhancedURLProcessor:
                 'source': 'url',
                 'url': url,
                 'raw_content': raw_content if 'raw_content' in locals() else None,
-                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
                 'extracted_data': None,
-                'processing_notes': f"Unexpected processing error: {str(e)}"
+                'processing_notes': [f"Unexpected processing error: {str(e)}"]
             }

     def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:

@@ -231,7 +241,6 @@ class EnhancedURLProcessor:
         lower_content_type = content_type.lower()
         notes = []
         extracted_data: Any = None  # Use Any to allow different types
-
         try:
             if 'text/html' in lower_content_type:
                 logger.debug(f"Processing HTML content from {base_url}")

@@ -253,10 +262,8 @@ class EnhancedURLProcessor:
             elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'):
                 logger.debug(f"Processing XML content from {base_url}")
                 try:
-                    # Try parsing XML. Convert to a string
-                    # For simplicity, we'll convert to a readable string representation of the tree.
+                    # Try parsing XML. Convert to a string representation.
                     root = ET.fromstring(content)
-                    # A simple way to represent XML as text
                     xml_text = ET.tostring(root, encoding='unicode', method='xml')
                     extracted_data = xml_text  # Store as string for now
                     notes.append("Parsed as XML (text representation)")

@@ -276,17 +283,14 @@ class EnhancedURLProcessor:
                 logger.debug(f"Unknown content type '{content_type}' from {base_url}. Storing raw content.")
                 extracted_data = content  # Store raw content for unknown types
                 notes.append(f"Unknown content type '{content_type}'. Stored raw text.")
-
         except Exception as e:
             logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}")
             extracted_data = content  # Fallback to raw content on error
             notes.append(f"Unexpected processing error: {e}. Stored raw text.")
-
         return {'data': extracted_data, 'notes': notes}

-
     def _process_html_content_enhanced(self, content: str, base_url: str) -> Dict[str, Any]:
-        """Process HTML content, preserving text, and extracting metadata."""
+        """Process HTML content, preserving text, and extracting metadata and links."""
         extracted: Dict[str, Any] = {
             'title': None,
             'meta_description': None,  # Add extraction for meta description

@@ -306,23 +310,33 @@ class EnhancedURLProcessor:
                 extracted['meta_description'] = meta_desc['content'].strip()

             # Extract and process links (convert relative to absolute)
+            # Use a set to avoid duplicate URLs in the links list
+            unique_links = set()
             for a_tag in soup.find_all('a', href=True):
-                href = a_tag['href']
-
-
-
-
-
+                href = a_tag['href'].strip()
+                if href and not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')):  # Basic filter
+                    text = a_tag.get_text().strip()
+                    try:
+                        absolute_url = urljoin(base_url, href)
+                        if absolute_url not in unique_links:
+                            extracted['links'].append({'text': text, 'url': absolute_url})
+                            unique_links.add(absolute_url)
+                    except Exception:
+                        # If urljoin fails, keep the original href if it looks like a valid potential URL part
+                        if validators.url(href) and href not in unique_links:
+                            extracted['links'].append({'text': text, 'url': href})
+                            unique_links.add(href)
+                        elif urlparse(href).netloc and href not in unique_links:  # Maybe just a domain/path?
+                            extracted['links'].append({'text': text, 'url': href})
+                            unique_links.add(href)


             # Extract all text content (similar to stripped_strings but ensures order)
-            text_parts = []
             # Use a more robust way to get visible text, including handling script/style tags
-
+            soup_copy = BeautifulSoup(content, 'html.parser')  # Work on a copy to preserve soup for links
+            for script_or_style in soup_copy(["script", "style"]):
                 script_or_style.extract()  # Remove script and style tags
-            text =
-
+            text = soup_copy.get_text(separator='\n')  # Get text with newlines
             # Clean up whitespace and empty lines
             lines = text.splitlines()
             cleaned_lines = [line.strip() for line in lines if line.strip()]

@@ -330,11 +344,189 @@ class EnhancedURLProcessor:

         except Exception as e:
             logger.error(f"Enhanced HTML processing error for {base_url}: {e}")
-
+            # Fallback: Store raw text and indicate error
+            soup_copy = BeautifulSoup(content, 'html.parser')
+            for script_or_style in soup_copy(["script", "style"]):
+                script_or_style.extract()
+            extracted['full_text'] = soup_copy.get_text(separator='\n').strip()
             extracted['processing_error'] = f"Enhanced HTML processing failed: {e}"

         return extracted

+    def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
+        """
+        Fetches content from a URL and recursively follows links up to a specified depth.
+
+        Args:
+            url: The initial URL to fetch.
+            max_steps: The maximum number of levels to follow links (0-3).
+                0: Only fetch the initial URL.
+                1: Fetch the initial URL and the links found on that page.
+                2: Fetch the initial URL, its links, and the links on those pages.
+                3: Fetch up to the third level of links.
+
+        Returns:
+            A dictionary containing the extraction result for the initial URL and
+            nested results for followed links.
+        """
+        if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
+            logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
+            }
+
+        validation_result = self.validate_url(url)
+        if not validation_result['is_valid']:
+            logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Initial URL validation failed: {validation_result['message']}"
+            }
+
+
+        return self._fetch_content_recursive(url, max_steps, current_step=0)
+
+    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
+        """Recursive helper to fetch content and follow links."""
+
+        if current_step > max_steps:
+            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
+            return {
+                'url': url,
+                'level': current_step,
+                'fetch_result': None,  # Indicate no fetch happened at this level
+                'linked_extractions': [],
+                'note': f"Depth limit ({max_steps}) reached."
+            }
+
+        logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
+
+        # Fetch content for the current URL
+        fetch_result = self.fetch_content(url)
+
+        linked_extractions: List[Dict[str, Any]] = []
+
+        # Only follow links if fetch was successful, content is HTML, and within depth limit
+        if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
+            extracted_data = fetch_result['extracted_data']
+            links = extracted_data.get('links', [])  # Ensure links is a list even if missing
+
+            logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
+
+            # Recursively fetch linked content if not at max depth
+            if current_step < max_steps:
+                for link_info in links:
+                    linked_url = link_info.get('url')
+                    if linked_url:
+                        # Simple check to avoid re-fetching the same URL repeatedly in a chain
+                        # More sophisticated cycle detection might be needed for complex graphs
+                        if linked_url != urlparse(url)._replace(fragment='').geturl():  # Avoid self-referencing links ignoring fragment
+                            # Recursively call for the linked URL
+                            linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
+                            linked_extractions.append(linked_result)
+                        else:
+                            logger.debug(f"Skipping self-referencing link: {linked_url}")
+                            linked_extractions.append({
+                                'url': linked_url,
+                                'level': current_step + 1,
+                                'fetch_result': None,
+                                'linked_extractions': [],
+                                'note': 'Skipped self-referencing link'
+                            })
+                    else:
+                        linked_extractions.append({
+                            'url': 'Invalid or missing link',
+                            'level': current_step + 1,
+                            'fetch_result': None,
+                            'linked_extractions': [],
+                            'note': 'Link URL not found or invalid'
+                        })
+            else:
+                logger.info(f"Max depth ({max_steps}) reached. Not following links from {url}.")
+
+
+        return {
+            'url': url,
+            'level': current_step,
+            'fetch_result': fetch_result,
+            'linked_extractions': linked_extractions,
+            'note': f"Processed at level {current_step}"
+        }
+
+# --- Example Usage ---
+if __name__ == "__main__":
+    processor = EnhancedURLProcessor()
+
+    # --- Test Cases ---
+
+    # Test with 0 steps (only initial URL)
+    print("\n--- Testing with max_steps = 0 ---")
+    result_0 = processor.fetch_content_with_depth("https://httpbin.org/html", max_steps=0)
+    # print(json.dumps(result_0, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_0['url']}) fetched at level {result_0['level']}. Success: {result_0['fetch_result'] is not None}")
+    print(f"Number of linked extractions: {len(result_0['linked_extractions'])}")  # Should be 0
+
+    # Test with 1 step (initial URL + its direct links)
+    # Note: Replace with a real website URL that has internal links for meaningful testing
+    # For demonstration, using a placeholder. A real site like a blog post or news article front page is better.
+    test_url_with_links = "https://quotes.toscrape.com/"  # Example site with links
+    print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
+    result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
+    # print(json.dumps(result_1, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_1['url']}) fetched at level {result_1['level']}. Success: {result_1['fetch_result'] is not None}")
+    print(f"Number of direct links found and processed: {len(result_1['linked_extractions'])}")
+    if result_1['linked_extractions']:
+        print(f"First linked URL processed at level 1: {result_1['linked_extractions'][0]['url']}")
+        print(f"Number of links found on the first linked page: {len(result_1['linked_extractions'][0]['linked_extractions'])}")  # Should be 0 for max_steps=1
+
+    # Test with 2 steps
+    print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
+    result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
+    # print(json.dumps(result_2, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_2['url']}) fetched at level {result_2['level']}. Success: {result_2['fetch_result'] is not None}")
+    print(f"Number of direct links found and processed (Level 1): {len(result_2['linked_extractions'])}")
+    if result_2['linked_extractions']:
+        print(f"First linked URL processed at level 1: {result_2['linked_extractions'][0]['url']}")
+        print(f"Number of links found on the first linked page and processed (Level 2): {len(result_2['linked_extractions'][0]['linked_extractions'])}")
+        if result_2['linked_extractions'][0]['linked_extractions']:
+            print(f"First level 2 linked URL: {result_2['linked_extractions'][0]['linked_extractions'][0]['url']}")
+            print(f"Number of links found on the first level 2 page: {len(result_2['linked_extractions'][0]['linked_extractions'][0]['linked_extractions'])}")  # Should be 0 for max_steps=2
+
+    # Test with max_steps = 3 (will go one level deeper than 2)
+    # print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
+    # result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
+    # print(json.dumps(result_3, indent=2))  # Uncomment to see full structure
+    # Add similar print statements for result_3 to show levels 1, 2, and 3 counts
+
+    # Test with invalid max_steps
+    print("\n--- Testing with invalid max_steps = 4 ---")
+    result_invalid = processor.fetch_content_with_depth("https://example.com", max_steps=4)
+    print(f"Result for invalid steps: {result_invalid.get('note')}")
+
+    # Test with invalid initial URL
+    print("\n--- Testing with invalid initial URL ---")
+    result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
+    print(f"Result for invalid initial URL: {result_invalid_url.get('note')}")
+
+    # Test with a URL that might fail to fetch
+    print("\n--- Testing with a potentially failing URL ---")
+    # Use a non-existent subdomain or a port that's unlikely to be open
+    failing_url = "http://this-domain-does-not-exist-12345.com/"
+    result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
+    print(f"Result for failing URL: {result_fail.get('note')}")
+    if result_fail.get('fetch_result'):
+        print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
+
 class EnhancedFileProcessor:
     """Advanced file processing with enhanced content extraction"""
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default