acecalisto3 committed
Commit 50921ef · verified · 1 Parent(s): 59e9295

Update app2.py

Files changed (1):
  1. app2.py +231 -39
app2.py CHANGED
@@ -88,13 +88,13 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
     directory.mkdir(parents=True, exist_ok=True)

 class EnhancedURLProcessor:
-    """Advanced URL processing with enhanced content extraction"""
+    """Advanced URL processing with enhanced content extraction and recursive link following."""
+
     def __init__(self):
         self.session = requests.Session()
         self.timeout = 15 # Extended timeout for larger content
         self.max_retries = 3
         self.user_agent = UserAgent()
-
         # Enhanced headers for better site compatibility
         self.session.headers.update({
             'User-Agent': self.user_agent.random,
@@ -110,7 +110,7 @@ class EnhancedURLProcessor:
             'DNT': '1'
         })

-    def validate_url(self, url: str) -> Dict:
+    def validate_url(self, url: str) -> Dict[str, Any]:
         """Enhanced URL validation with detailed feedback"""
         try:
             if not validators.url(url):
@@ -123,36 +123,47 @@ class EnhancedURLProcessor:
                 head_response = self.session.head(url, timeout=5)
                 head_response.raise_for_status()
                 final_url = head_response.url # Capture potential redirects
+                content_type = head_response.headers.get('Content-Type', 'unknown')
+                server = head_response.headers.get('Server', 'unknown')
+                size = head_response.headers.get('Content-Length', 'unknown')
             except requests.exceptions.RequestException:
                 # If HEAD fails, try GET as some servers don't support HEAD
-                response = self.session.get(url, timeout=self.timeout)
-                response.raise_for_status()
-                final_url = response.url # Capture potential redirects
+                try:
+                    response = self.session.get(url, timeout=self.timeout)
+                    response.raise_for_status()
+                    final_url = response.url # Capture potential redirects
+                    content_type = response.headers.get('Content-Type', 'unknown')
+                    server = response.headers.get('Server', 'unknown')
+                    size = response.headers.get('Content-Length', 'unknown') # May not be accurate for full content
+                except requests.exceptions.RequestException as get_e:
+                    return {'is_valid': False, 'message': f'URL not accessible after HEAD/GET attempts: {str(get_e)}', 'details': str(get_e)}
+                except Exception as get_e:
+                    return {'is_valid': False, 'message': f'Unexpected error during GET validation: {str(get_e)}', 'details': str(get_e)}
+

             return {
                 'is_valid': True,
                 'message': 'URL is valid and accessible',
                 'details': {
                     'final_url': final_url,
-                    'content_type': head_response.headers.get('Content-Type', 'unknown'),
-                    'server': head_response.headers.get('Server', 'unknown'),
-                    'size': head_response.headers.get('Content-Length', 'unknown')
+                    'content_type': content_type,
+                    'server': server,
+                    'size': size
                 }
             }
         except Exception as e:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}

-    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict]:
+    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
         """Enhanced content fetcher with retry mechanism and complete character extraction"""
         try:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
-
             # Update User-Agent randomly for each request
             self.session.headers.update({'User-Agent': self.user_agent.random})
-
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
             final_url = response.url # Capture potential redirects
+            content_type = response.headers.get('Content-Type', '')

             # Detect encoding
             if response.encoding is None or response.encoding == 'ISO-8859-1': # chardet often better than default response.encoding for text
@@ -177,14 +188,13 @@ class EnhancedURLProcessor:
                 encoding = 'latin-1 (fallback)'
                 logger.warning(f"Decoding with {encoding} fallback for {url}")

-
             # Extract metadata
             metadata = {
                 'original_url': url,
                 'final_url': final_url,
                 'timestamp': datetime.now().isoformat(),
                 'detected_encoding': encoding,
-                'content_type': response.headers.get('Content-Type', ''),
+                'content_type': content_type,
                 'content_length': len(response.content),
                 'headers': dict(response.headers),
                 'status_code': response.status_code
@@ -195,7 +205,7 @@ class EnhancedURLProcessor:

             return {
                 'source': 'url',
-                'url': url, # Keep original URL as identifier
+                'url': url, # Keep original URL as identifier for this step
                 'raw_content': raw_content,
                 'metadata': metadata,
                 'extracted_data': processed_extraction['data'],
@@ -211,9 +221,9 @@ class EnhancedURLProcessor:
                 'source': 'url',
                 'url': url,
                 'raw_content': None,
-                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None}, # Include basic metadata on failure
                 'extracted_data': None,
-                'processing_notes': f"Failed to fetch content: {str(e)}"
+                'processing_notes': [f"Failed to fetch content after {self.max_retries} attempts: {str(e)}"] # Ensure notes is a list
             }
         except Exception as e:
             logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")
@@ -221,9 +231,9 @@ class EnhancedURLProcessor:
                 'source': 'url',
                 'url': url,
                 'raw_content': raw_content if 'raw_content' in locals() else None,
-                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
                 'extracted_data': None,
-                'processing_notes': f"Unexpected processing error: {str(e)}"
+                'processing_notes': [f"Unexpected processing error: {str(e)}"]
             }

     def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
@@ -231,7 +241,6 @@ class EnhancedURLProcessor:
         lower_content_type = content_type.lower()
         notes = []
         extracted_data: Any = None # Use Any to allow different types
-
         try:
             if 'text/html' in lower_content_type:
                 logger.debug(f"Processing HTML content from {base_url}")
@@ -253,10 +262,8 @@ class EnhancedURLProcessor:
             elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'):
                 logger.debug(f"Processing XML content from {base_url}")
                 try:
-                    # Try parsing XML. Convert to a string or a dict representation if needed.
-                    # For simplicity, we'll convert to a readable string representation of the tree.
+                    # Try parsing XML. Convert to a string representation.
                     root = ET.fromstring(content)
-                    # A simple way to represent XML as text
                     xml_text = ET.tostring(root, encoding='unicode', method='xml')
                     extracted_data = xml_text # Store as string for now
                     notes.append("Parsed as XML (text representation)")
@@ -276,17 +283,14 @@ class EnhancedURLProcessor:
                 logger.debug(f"Unknown content type '{content_type}' from {base_url}. Storing raw content.")
                 extracted_data = content # Store raw content for unknown types
                 notes.append(f"Unknown content type '{content_type}'. Stored raw text.")
-
         except Exception as e:
             logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}")
             extracted_data = content # Fallback to raw content on error
             notes.append(f"Unexpected processing error: {e}. Stored raw text.")
-
         return {'data': extracted_data, 'notes': notes}

-
     def _process_html_content_enhanced(self, content: str, base_url: str) -> Dict[str, Any]:
-        """Process HTML content, preserving text, and extracting metadata."""
+        """Process HTML content, preserving text, and extracting metadata and links."""
         extracted: Dict[str, Any] = {
             'title': None,
             'meta_description': None, # Add extraction for meta description
@@ -306,23 +310,33 @@ class EnhancedURLProcessor:
                 extracted['meta_description'] = meta_desc['content'].strip()

             # Extract and process links (convert relative to absolute)
+            # Use a set to avoid duplicate URLs in the links list
+            unique_links = set()
             for a_tag in soup.find_all('a', href=True):
-                href = a_tag['href']
-                text = a_tag.get_text().strip()
-                try:
-                    absolute_url = urljoin(base_url, href)
-                    extracted['links'].append({'text': text, 'url': absolute_url})
-                except Exception:
-                    extracted['links'].append({'text': text, 'url': href}) # Keep relative if join fails
+                href = a_tag['href'].strip()
+                if href and not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')): # Basic filter
+                    text = a_tag.get_text().strip()
+                    try:
+                        absolute_url = urljoin(base_url, href)
+                        if absolute_url not in unique_links:
+                            extracted['links'].append({'text': text, 'url': absolute_url})
+                            unique_links.add(absolute_url)
+                    except Exception:
+                        # If urljoin fails, keep the original href if it looks like a valid potential URL part
+                        if validators.url(href) and href not in unique_links:
+                            extracted['links'].append({'text': text, 'url': href})
+                            unique_links.add(href)
+                        elif urlparse(href).netloc and href not in unique_links: # Maybe just a domain/path?
+                            extracted['links'].append({'text': text, 'url': href})
+                            unique_links.add(href)


             # Extract all text content (similar to stripped_strings but ensures order)
-            text_parts = []
             # Use a more robust way to get visible text, including handling script/style tags
-            for script_or_style in soup(["script", "style"]):
+            soup_copy = BeautifulSoup(content, 'html.parser') # Work on a copy to preserve soup for links
+            for script_or_style in soup_copy(["script", "style"]):
                 script_or_style.extract() # Remove script and style tags
-            text = soup.get_text(separator='\n') # Get text with newlines
-
+            text = soup_copy.get_text(separator='\n') # Get text with newlines
             # Clean up whitespace and empty lines
             lines = text.splitlines()
             cleaned_lines = [line.strip() for line in lines if line.strip()]
@@ -330,11 +344,189 @@ class EnhancedURLProcessor:

         except Exception as e:
             logger.error(f"Enhanced HTML processing error for {base_url}: {e}")
-            extracted['full_text'] = content # Fallback to raw content
+            # Fallback: Store raw text and indicate error
+            soup_copy = BeautifulSoup(content, 'html.parser')
+            for script_or_style in soup_copy(["script", "style"]):
+                script_or_style.extract()
+            extracted['full_text'] = soup_copy.get_text(separator='\n').strip()
             extracted['processing_error'] = f"Enhanced HTML processing failed: {e}"

         return extracted

+    def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
+        """
+        Fetches content from a URL and recursively follows links up to a specified depth.
+
+        Args:
+            url: The initial URL to fetch.
+            max_steps: The maximum number of levels to follow links (0-3).
+                       0: Only fetch the initial URL.
+                       1: Fetch the initial URL and the links found on that page.
+                       2: Fetch the initial URL, its links, and the links on those pages.
+                       3: Fetch up to the third level of links.
+
+        Returns:
+            A dictionary containing the extraction result for the initial URL and
+            nested results for followed links.
+        """
+        if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
+            logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
+            }
+
+        validation_result = self.validate_url(url)
+        if not validation_result['is_valid']:
+            logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Initial URL validation failed: {validation_result['message']}"
+            }
+
+
+        return self._fetch_content_recursive(url, max_steps, current_step=0)
+
+    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
+        """Recursive helper to fetch content and follow links."""
+
+        if current_step > max_steps:
+            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
+            return {
+                'url': url,
+                'level': current_step,
+                'fetch_result': None, # Indicate no fetch happened at this level
+                'linked_extractions': [],
+                'note': f"Depth limit ({max_steps}) reached."
+            }
+
+        logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
+
+        # Fetch content for the current URL
+        fetch_result = self.fetch_content(url)
+
+        linked_extractions: List[Dict[str, Any]] = []
+
+        # Only follow links if fetch was successful, content is HTML, and within depth limit
+        if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
+            extracted_data = fetch_result['extracted_data']
+            links = extracted_data.get('links', []) # Ensure links is a list even if missing
+
+            logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
+
+            # Recursively fetch linked content if not at max depth
+            if current_step < max_steps:
+                for link_info in links:
+                    linked_url = link_info.get('url')
+                    if linked_url:
+                        # Simple check to avoid re-fetching the same URL repeatedly in a chain
+                        # More sophisticated cycle detection might be needed for complex graphs
+                        if linked_url != urlparse(url)._replace(fragment='').geturl(): # Avoid self-referencing links ignoring fragment
+                            # Recursively call for the linked URL
+                            linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
+                            linked_extractions.append(linked_result)
+                        else:
+                            logger.debug(f"Skipping self-referencing link: {linked_url}")
+                            linked_extractions.append({
+                                'url': linked_url,
+                                'level': current_step + 1,
+                                'fetch_result': None,
+                                'linked_extractions': [],
+                                'note': 'Skipped self-referencing link'
+                            })
+                    else:
+                        linked_extractions.append({
+                            'url': 'Invalid or missing link',
+                            'level': current_step + 1,
+                            'fetch_result': None,
+                            'linked_extractions': [],
+                            'note': 'Link URL not found or invalid'
+                        })
+            else:
+                logger.info(f"Max depth ({max_steps}) reached. Not following links from {url}.")
+
+
+        return {
+            'url': url,
+            'level': current_step,
+            'fetch_result': fetch_result,
+            'linked_extractions': linked_extractions,
+            'note': f"Processed at level {current_step}"
+        }
+
+# --- Example Usage ---
+if __name__ == "__main__":
+    processor = EnhancedURLProcessor()
+
+    # --- Test Cases ---
+
+    # Test with 0 steps (only initial URL)
+    print("\n--- Testing with max_steps = 0 ---")
+    result_0 = processor.fetch_content_with_depth("https://httpbin.org/html", max_steps=0)
+    # print(json.dumps(result_0, indent=2)) # Uncomment to see full structure
+
+    print(f"Initial URL ({result_0['url']}) fetched at level {result_0['level']}. Success: {result_0['fetch_result'] is not None}")
+    print(f"Number of linked extractions: {len(result_0['linked_extractions'])}") # Should be 0
+
+    # Test with 1 step (initial URL + its direct links)
+    # Note: Replace with a real website URL that has internal links for meaningful testing
+    # For demonstration, using a placeholder. A real site like a blog post or news article front page is better.
+    test_url_with_links = "https://quotes.toscrape.com/" # Example site with links
+    print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
+    result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
+    # print(json.dumps(result_1, indent=2)) # Uncomment to see full structure
+
+    print(f"Initial URL ({result_1['url']}) fetched at level {result_1['level']}. Success: {result_1['fetch_result'] is not None}")
+    print(f"Number of direct links found and processed: {len(result_1['linked_extractions'])}")
+    if result_1['linked_extractions']:
+        print(f"First linked URL processed at level 1: {result_1['linked_extractions'][0]['url']}")
+        print(f"Number of links found on the first linked page: {len(result_1['linked_extractions'][0]['linked_extractions'])}") # Should be 0 for max_steps=1
+
+    # Test with 2 steps
+    print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
+    result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
+    # print(json.dumps(result_2, indent=2)) # Uncomment to see full structure
+
+    print(f"Initial URL ({result_2['url']}) fetched at level {result_2['level']}. Success: {result_2['fetch_result'] is not None}")
+    print(f"Number of direct links found and processed (Level 1): {len(result_2['linked_extractions'])}")
+    if result_2['linked_extractions']:
+        print(f"First linked URL processed at level 1: {result_2['linked_extractions'][0]['url']}")
+        print(f"Number of links found on the first linked page and processed (Level 2): {len(result_2['linked_extractions'][0]['linked_extractions'])}")
+        if result_2['linked_extractions'][0]['linked_extractions']:
+            print(f"First level 2 linked URL: {result_2['linked_extractions'][0]['linked_extractions'][0]['url']}")
+            print(f"Number of links found on the first level 2 page: {len(result_2['linked_extractions'][0]['linked_extractions'][0]['linked_extractions'])}") # Should be 0 for max_steps=2
+
+    # Test with max_steps = 3 (will go one level deeper than 2)
+    # print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
+    # result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
+    # print(json.dumps(result_3, indent=2)) # Uncomment to see full structure
+    # Add similar print statements for result_3 to show levels 1, 2, and 3 counts
+
+    # Test with invalid max_steps
+    print("\n--- Testing with invalid max_steps = 4 ---")
+    result_invalid = processor.fetch_content_with_depth("https://example.com", max_steps=4)
+    print(f"Result for invalid steps: {result_invalid.get('note')}")
+
+    # Test with invalid initial URL
+    print("\n--- Testing with invalid initial URL ---")
+    result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
+    print(f"Result for invalid initial URL: {result_invalid_url.get('note')}")
+
+    # Test with a URL that might fail to fetch
+    print("\n--- Testing with a potentially failing URL ---")
+    # Use a non-existent subdomain or a port that's unlikely to be open
+    failing_url = "http://this-domain-does-not-exist-12345.com/"
+    result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
+    print(f"Result for failing URL: {result_fail.get('note')}")
+    if result_fail.get('fetch_result'):
+        print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
+
 class EnhancedFileProcessor:
     """Advanced file processing with enhanced content extraction"""
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default
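For reference, a minimal sketch (not part of this commit) of how a caller might consume the nested structure that the new fetch_content_with_depth returns. Every node in that structure carries the same keys ('url', 'level', 'fetch_result', 'linked_extractions', 'note'), so a depth-first walk can flatten the tree into one row per visited URL. The helper name flatten_extractions and the "from app2 import EnhancedURLProcessor" import are illustrative assumptions, not code from app2.py.

# Hypothetical helper, not part of app2.py: flatten the nested result of
# EnhancedURLProcessor.fetch_content_with_depth into one row per visited URL.
from typing import Any, Dict, List, Tuple

def flatten_extractions(node: Dict[str, Any]) -> List[Tuple[int, str, str]]:
    """Depth-first walk over 'linked_extractions', collecting (level, url, note) rows."""
    rows: List[Tuple[int, str, str]] = [(node['level'], node['url'], node.get('note', ''))]
    for child in node.get('linked_extractions', []):
        rows.extend(flatten_extractions(child))
    return rows

if __name__ == "__main__":
    from app2 import EnhancedURLProcessor # assumes app2.py is importable as a module

    processor = EnhancedURLProcessor()
    tree = processor.fetch_content_with_depth("https://quotes.toscrape.com/", max_steps=1)
    for level, url, note in flatten_extractions(tree):
        print(f"{'  ' * level}[{level}] {url} - {note}")

Nodes produced for skipped, invalid, or depth-limited links have fetch_result set to None and a descriptive 'note', so the same walk surfaces them without extra error handling.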