Update app2.py

app2.py CHANGED
@@ -88,13 +88,13 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
     directory.mkdir(parents=True, exist_ok=True)
 
 class EnhancedURLProcessor:
-    """Advanced URL processing with enhanced content extraction"""
+    """Advanced URL processing with enhanced content extraction and recursive link following."""
+
     def __init__(self):
         self.session = requests.Session()
         self.timeout = 15  # Extended timeout for larger content
         self.max_retries = 3
         self.user_agent = UserAgent()
-
         # Enhanced headers for better site compatibility
         self.session.headers.update({
             'User-Agent': self.user_agent.random,
@@ -110,7 +110,7 @@ class EnhancedURLProcessor:
             'DNT': '1'
         })
 
-    def validate_url(self, url: str) -> Dict:
+    def validate_url(self, url: str) -> Dict[str, Any]:
         """Enhanced URL validation with detailed feedback"""
         try:
             if not validators.url(url):
@@ -123,36 +123,47 @@ class EnhancedURLProcessor:
                 head_response = self.session.head(url, timeout=5)
                 head_response.raise_for_status()
                 final_url = head_response.url  # Capture potential redirects
+                content_type = head_response.headers.get('Content-Type', 'unknown')
+                server = head_response.headers.get('Server', 'unknown')
+                size = head_response.headers.get('Content-Length', 'unknown')
             except requests.exceptions.RequestException:
                 # If HEAD fails, try GET as some servers don't support HEAD
-
-
-
+                try:
+                    response = self.session.get(url, timeout=self.timeout)
+                    response.raise_for_status()
+                    final_url = response.url  # Capture potential redirects
+                    content_type = response.headers.get('Content-Type', 'unknown')
+                    server = response.headers.get('Server', 'unknown')
+                    size = response.headers.get('Content-Length', 'unknown')  # May not be accurate for full content
+                except requests.exceptions.RequestException as get_e:
+                    return {'is_valid': False, 'message': f'URL not accessible after HEAD/GET attempts: {str(get_e)}', 'details': str(get_e)}
+                except Exception as get_e:
+                    return {'is_valid': False, 'message': f'Unexpected error during GET validation: {str(get_e)}', 'details': str(get_e)}
+
 
             return {
                 'is_valid': True,
                 'message': 'URL is valid and accessible',
                 'details': {
                     'final_url': final_url,
-                    'content_type':
-                    'server':
-                    'size':
+                    'content_type': content_type,
+                    'server': server,
+                    'size': size
                 }
             }
         except Exception as e:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
 
-    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict]:
+    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
         """Enhanced content fetcher with retry mechanism and complete character extraction"""
         try:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
-
             # Update User-Agent randomly for each request
             self.session.headers.update({'User-Agent': self.user_agent.random})
-
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
             final_url = response.url  # Capture potential redirects
+            content_type = response.headers.get('Content-Type', '')
 
             # Detect encoding
             if response.encoding is None or response.encoding == 'ISO-8859-1':  # chardet often better than default response.encoding for text
@@ -177,14 +188,13 @@ class EnhancedURLProcessor:
                     encoding = 'latin-1 (fallback)'
                     logger.warning(f"Decoding with {encoding} fallback for {url}")
 
-
             # Extract metadata
             metadata = {
                 'original_url': url,
                 'final_url': final_url,
                 'timestamp': datetime.now().isoformat(),
                 'detected_encoding': encoding,
-                'content_type':
+                'content_type': content_type,
                 'content_length': len(response.content),
                 'headers': dict(response.headers),
                 'status_code': response.status_code
@@ -195,7 +205,7 @@ class EnhancedURLProcessor:
 
             return {
                 'source': 'url',
-                'url': url,  # Keep original URL as identifier
+                'url': url,  # Keep original URL as identifier for this step
                 'raw_content': raw_content,
                 'metadata': metadata,
                 'extracted_data': processed_extraction['data'],
@@ -211,9 +221,9 @@ class EnhancedURLProcessor:
                 'source': 'url',
                 'url': url,
                 'raw_content': None,
-                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},  # Include basic metadata on failure
                 'extracted_data': None,
-                'processing_notes': f"Failed to fetch content: {str(e)}"
+                'processing_notes': [f"Failed to fetch content after {self.max_retries} attempts: {str(e)}"]  # Ensure notes is a list
             }
         except Exception as e:
             logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")
@@ -221,9 +231,9 @@ class EnhancedURLProcessor:
                 'source': 'url',
                 'url': url,
                 'raw_content': raw_content if 'raw_content' in locals() else None,
-                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
                 'extracted_data': None,
-                'processing_notes': f"Unexpected processing error: {str(e)}"
+                'processing_notes': [f"Unexpected processing error: {str(e)}"]
             }
 
     def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
@@ -231,7 +241,6 @@ class EnhancedURLProcessor:
         lower_content_type = content_type.lower()
         notes = []
        extracted_data: Any = None  # Use Any to allow different types
-
        try:
            if 'text/html' in lower_content_type:
                logger.debug(f"Processing HTML content from {base_url}")
@@ -253,10 +262,8 @@ class EnhancedURLProcessor:
             elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'):
                 logger.debug(f"Processing XML content from {base_url}")
                 try:
-                    # Try parsing XML. Convert to a string
-                    # For simplicity, we'll convert to a readable string representation of the tree.
+                    # Try parsing XML. Convert to a string representation.
                     root = ET.fromstring(content)
-                    # A simple way to represent XML as text
                     xml_text = ET.tostring(root, encoding='unicode', method='xml')
                     extracted_data = xml_text  # Store as string for now
                     notes.append("Parsed as XML (text representation)")
@@ -276,17 +283,14 @@ class EnhancedURLProcessor:
                 logger.debug(f"Unknown content type '{content_type}' from {base_url}. Storing raw content.")
                 extracted_data = content  # Store raw content for unknown types
                 notes.append(f"Unknown content type '{content_type}'. Stored raw text.")
-
         except Exception as e:
             logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}")
             extracted_data = content  # Fallback to raw content on error
             notes.append(f"Unexpected processing error: {e}. Stored raw text.")
-
         return {'data': extracted_data, 'notes': notes}
 
-
     def _process_html_content_enhanced(self, content: str, base_url: str) -> Dict[str, Any]:
-        """Process HTML content, preserving text, and extracting metadata."""
+        """Process HTML content, preserving text, and extracting metadata and links."""
         extracted: Dict[str, Any] = {
             'title': None,
             'meta_description': None,  # Add extraction for meta description
@@ -306,23 +310,33 @@ class EnhancedURLProcessor:
                 extracted['meta_description'] = meta_desc['content'].strip()
 
             # Extract and process links (convert relative to absolute)
+            # Use a set to avoid duplicate URLs in the links list
+            unique_links = set()
             for a_tag in soup.find_all('a', href=True):
-                href = a_tag['href']
-
-
-
-
-
-
+                href = a_tag['href'].strip()
+                if href and not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')):  # Basic filter
+                    text = a_tag.get_text().strip()
+                    try:
+                        absolute_url = urljoin(base_url, href)
+                        if absolute_url not in unique_links:
+                            extracted['links'].append({'text': text, 'url': absolute_url})
+                            unique_links.add(absolute_url)
+                    except Exception:
+                        # If urljoin fails, keep the original href if it looks like a valid potential URL part
+                        if validators.url(href) and href not in unique_links:
+                            extracted['links'].append({'text': text, 'url': href})
+                            unique_links.add(href)
+                        elif urlparse(href).netloc and href not in unique_links:  # Maybe just a domain/path?
+                            extracted['links'].append({'text': text, 'url': href})
+                            unique_links.add(href)
 
 
             # Extract all text content (similar to stripped_strings but ensures order)
-            text_parts = []
             # Use a more robust way to get visible text, including handling script/style tags
-
+            soup_copy = BeautifulSoup(content, 'html.parser')  # Work on a copy to preserve soup for links
+            for script_or_style in soup_copy(["script", "style"]):
                 script_or_style.extract()  # Remove script and style tags
-            text =
-
+            text = soup_copy.get_text(separator='\n')  # Get text with newlines
             # Clean up whitespace and empty lines
             lines = text.splitlines()
             cleaned_lines = [line.strip() for line in lines if line.strip()]
@@ -330,11 +344,189 @@
 
         except Exception as e:
             logger.error(f"Enhanced HTML processing error for {base_url}: {e}")
-
+            # Fallback: Store raw text and indicate error
+            soup_copy = BeautifulSoup(content, 'html.parser')
+            for script_or_style in soup_copy(["script", "style"]):
+                script_or_style.extract()
+            extracted['full_text'] = soup_copy.get_text(separator='\n').strip()
             extracted['processing_error'] = f"Enhanced HTML processing failed: {e}"
 
         return extracted
 
+    def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
+        """
+        Fetches content from a URL and recursively follows links up to a specified depth.
+
+        Args:
+            url: The initial URL to fetch.
+            max_steps: The maximum number of levels to follow links (0-3).
+                0: Only fetch the initial URL.
+                1: Fetch the initial URL and the links found on that page.
+                2: Fetch the initial URL, its links, and the links on those pages.
+                3: Fetch up to the third level of links.
+
+        Returns:
+            A dictionary containing the extraction result for the initial URL and
+            nested results for followed links.
+        """
+        if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
+            logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
+            }
+
+        validation_result = self.validate_url(url)
+        if not validation_result['is_valid']:
+            logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Initial URL validation failed: {validation_result['message']}"
+            }
+
+
+        return self._fetch_content_recursive(url, max_steps, current_step=0)
+
+    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
+        """Recursive helper to fetch content and follow links."""
+
+        if current_step > max_steps:
+            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
+            return {
+                'url': url,
+                'level': current_step,
+                'fetch_result': None,  # Indicate no fetch happened at this level
+                'linked_extractions': [],
+                'note': f"Depth limit ({max_steps}) reached."
+            }
+
+        logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
+
+        # Fetch content for the current URL
+        fetch_result = self.fetch_content(url)
+
+        linked_extractions: List[Dict[str, Any]] = []
+
+        # Only follow links if fetch was successful, content is HTML, and within depth limit
+        if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
+            extracted_data = fetch_result['extracted_data']
+            links = extracted_data.get('links', [])  # Ensure links is a list even if missing
+
+            logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
+
+            # Recursively fetch linked content if not at max depth
+            if current_step < max_steps:
+                for link_info in links:
+                    linked_url = link_info.get('url')
+                    if linked_url:
+                        # Simple check to avoid re-fetching the same URL repeatedly in a chain
+                        # More sophisticated cycle detection might be needed for complex graphs
+                        if linked_url != urlparse(url)._replace(fragment='').geturl():  # Avoid self-referencing links ignoring fragment
+                            # Recursively call for the linked URL
+                            linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
+                            linked_extractions.append(linked_result)
+                        else:
+                            logger.debug(f"Skipping self-referencing link: {linked_url}")
+                            linked_extractions.append({
+                                'url': linked_url,
+                                'level': current_step + 1,
+                                'fetch_result': None,
+                                'linked_extractions': [],
+                                'note': 'Skipped self-referencing link'
+                            })
+                    else:
+                        linked_extractions.append({
+                            'url': 'Invalid or missing link',
+                            'level': current_step + 1,
+                            'fetch_result': None,
+                            'linked_extractions': [],
+                            'note': 'Link URL not found or invalid'
+                        })
+            else:
+                logger.info(f"Max depth ({max_steps}) reached. Not following links from {url}.")
+
+
+        return {
+            'url': url,
+            'level': current_step,
+            'fetch_result': fetch_result,
+            'linked_extractions': linked_extractions,
+            'note': f"Processed at level {current_step}"
+        }
+
+# --- Example Usage ---
+if __name__ == "__main__":
+    processor = EnhancedURLProcessor()
+
+    # --- Test Cases ---
+
+    # Test with 0 steps (only initial URL)
+    print("\n--- Testing with max_steps = 0 ---")
+    result_0 = processor.fetch_content_with_depth("https://httpbin.org/html", max_steps=0)
+    # print(json.dumps(result_0, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_0['url']}) fetched at level {result_0['level']}. Success: {result_0['fetch_result'] is not None}")
+    print(f"Number of linked extractions: {len(result_0['linked_extractions'])}")  # Should be 0
+
+    # Test with 1 step (initial URL + its direct links)
+    # Note: Replace with a real website URL that has internal links for meaningful testing
+    # For demonstration, using a placeholder. A real site like a blog post or news article front page is better.
+    test_url_with_links = "https://quotes.toscrape.com/"  # Example site with links
+    print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
+    result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
+    # print(json.dumps(result_1, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_1['url']}) fetched at level {result_1['level']}. Success: {result_1['fetch_result'] is not None}")
+    print(f"Number of direct links found and processed: {len(result_1['linked_extractions'])}")
+    if result_1['linked_extractions']:
+        print(f"First linked URL processed at level 1: {result_1['linked_extractions'][0]['url']}")
+        print(f"Number of links found on the first linked page: {len(result_1['linked_extractions'][0]['linked_extractions'])}")  # Should be 0 for max_steps=1
+
+    # Test with 2 steps
+    print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
+    result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
+    # print(json.dumps(result_2, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_2['url']}) fetched at level {result_2['level']}. Success: {result_2['fetch_result'] is not None}")
+    print(f"Number of direct links found and processed (Level 1): {len(result_2['linked_extractions'])}")
+    if result_2['linked_extractions']:
+        print(f"First linked URL processed at level 1: {result_2['linked_extractions'][0]['url']}")
+        print(f"Number of links found on the first linked page and processed (Level 2): {len(result_2['linked_extractions'][0]['linked_extractions'])}")
+        if result_2['linked_extractions'][0]['linked_extractions']:
+            print(f"First level 2 linked URL: {result_2['linked_extractions'][0]['linked_extractions'][0]['url']}")
+            print(f"Number of links found on the first level 2 page: {len(result_2['linked_extractions'][0]['linked_extractions'][0]['linked_extractions'])}")  # Should be 0 for max_steps=2
+
+    # Test with max_steps = 3 (will go one level deeper than 2)
+    # print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
+    # result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
+    # print(json.dumps(result_3, indent=2))  # Uncomment to see full structure
+    # Add similar print statements for result_3 to show levels 1, 2, and 3 counts
+
+    # Test with invalid max_steps
+    print("\n--- Testing with invalid max_steps = 4 ---")
+    result_invalid = processor.fetch_content_with_depth("https://example.com", max_steps=4)
+    print(f"Result for invalid steps: {result_invalid.get('note')}")
+
+    # Test with invalid initial URL
+    print("\n--- Testing with invalid initial URL ---")
+    result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
+    print(f"Result for invalid initial URL: {result_invalid_url.get('note')}")
+
+    # Test with a URL that might fail to fetch
+    print("\n--- Testing with a potentially failing URL ---")
+    # Use a non-existent subdomain or a port that's unlikely to be open
+    failing_url = "http://this-domain-does-not-exist-12345.com/"
+    result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
+    print(f"Result for failing URL: {result_fail.get('note')}")
+    if result_fail.get('fetch_result'):
+        print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
+
 class EnhancedFileProcessor:
     """Advanced file processing with enhanced content extraction"""
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default