Update app2.py

app2.py (CHANGED)
@@ -88,13 +88,13 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
     directory.mkdir(parents=True, exist_ok=True)

 class EnhancedURLProcessor:
-    """Advanced URL processing with enhanced content extraction"""
+    """Advanced URL processing with enhanced content extraction and recursive link following."""
+
     def __init__(self):
         self.session = requests.Session()
         self.timeout = 15  # Extended timeout for larger content
         self.max_retries = 3
         self.user_agent = UserAgent()
-
         # Enhanced headers for better site compatibility
         self.session.headers.update({
             'User-Agent': self.user_agent.random,

@@ -110,7 +110,7 @@ class EnhancedURLProcessor:
             'DNT': '1'
         })

-    def validate_url(self, url: str) -> Dict:
+    def validate_url(self, url: str) -> Dict[str, Any]:
         """Enhanced URL validation with detailed feedback"""
         try:
             if not validators.url(url):

@@ -123,36 +123,47 @@ class EnhancedURLProcessor:
             head_response = self.session.head(url, timeout=5)
             head_response.raise_for_status()
             final_url = head_response.url  # Capture potential redirects
+            content_type = head_response.headers.get('Content-Type', 'unknown')
+            server = head_response.headers.get('Server', 'unknown')
+            size = head_response.headers.get('Content-Length', 'unknown')
         except requests.exceptions.RequestException:
             # If HEAD fails, try GET as some servers don't support HEAD
-
-
-
+            try:
+                response = self.session.get(url, timeout=self.timeout)
+                response.raise_for_status()
+                final_url = response.url  # Capture potential redirects
+                content_type = response.headers.get('Content-Type', 'unknown')
+                server = response.headers.get('Server', 'unknown')
+                size = response.headers.get('Content-Length', 'unknown')  # May not be accurate for full content
+            except requests.exceptions.RequestException as get_e:
+                return {'is_valid': False, 'message': f'URL not accessible after HEAD/GET attempts: {str(get_e)}', 'details': str(get_e)}
+            except Exception as get_e:
+                return {'is_valid': False, 'message': f'Unexpected error during GET validation: {str(get_e)}', 'details': str(get_e)}
+

             return {
                 'is_valid': True,
                 'message': 'URL is valid and accessible',
                 'details': {
                     'final_url': final_url,
-                    'content_type':
-                    'server':
-                    'size':
+                    'content_type': content_type,
+                    'server': server,
+                    'size': size
                 }
             }
         except Exception as e:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}

-    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict]:
+    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict[str, Any]]:
         """Enhanced content fetcher with retry mechanism and complete character extraction"""
         try:
             logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
-
             # Update User-Agent randomly for each request
             self.session.headers.update({'User-Agent': self.user_agent.random})
-
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
             final_url = response.url  # Capture potential redirects
+            content_type = response.headers.get('Content-Type', '')

             # Detect encoding
             if response.encoding is None or response.encoding == 'ISO-8859-1':  # chardet often better than default response.encoding for text

@@ -177,14 +188,13 @@ class EnhancedURLProcessor:
                     encoding = 'latin-1 (fallback)'
                     logger.warning(f"Decoding with {encoding} fallback for {url}")

-
             # Extract metadata
             metadata = {
                 'original_url': url,
                 'final_url': final_url,
                 'timestamp': datetime.now().isoformat(),
                 'detected_encoding': encoding,
-                'content_type':
+                'content_type': content_type,
                 'content_length': len(response.content),
                 'headers': dict(response.headers),
                 'status_code': response.status_code

@@ -195,7 +205,7 @@ class EnhancedURLProcessor:

             return {
                 'source': 'url',
-                'url': url,  # Keep original URL as identifier
+                'url': url,  # Keep original URL as identifier for this step
                 'raw_content': raw_content,
                 'metadata': metadata,
                 'extracted_data': processed_extraction['data'],

@@ -211,9 +221,9 @@ class EnhancedURLProcessor:
                 'source': 'url',
                 'url': url,
                 'raw_content': None,
-                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},  # Include basic metadata on failure
                 'extracted_data': None,
-                'processing_notes': f"Failed to fetch content: {str(e)}"
+                'processing_notes': [f"Failed to fetch content after {self.max_retries} attempts: {str(e)}"]  # Ensure notes is a list
             }
         except Exception as e:
             logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")

@@ -221,9 +231,9 @@ class EnhancedURLProcessor:
                 'source': 'url',
                 'url': url,
                 'raw_content': raw_content if 'raw_content' in locals() else None,
-                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat(), 'status_code': None},
                 'extracted_data': None,
-                'processing_notes': f"Unexpected processing error: {str(e)}"
+                'processing_notes': [f"Unexpected processing error: {str(e)}"]
             }

     def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:

@@ -231,7 +241,6 @@ class EnhancedURLProcessor:
         lower_content_type = content_type.lower()
         notes = []
         extracted_data: Any = None  # Use Any to allow different types
-
         try:
             if 'text/html' in lower_content_type:
                 logger.debug(f"Processing HTML content from {base_url}")

@@ -253,10 +262,8 @@ class EnhancedURLProcessor:
             elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'):
                 logger.debug(f"Processing XML content from {base_url}")
                 try:
-                    # Try parsing XML. Convert to a string
-                    # For simplicity, we'll convert to a readable string representation of the tree.
+                    # Try parsing XML. Convert to a string representation.
                     root = ET.fromstring(content)
-                    # A simple way to represent XML as text
                     xml_text = ET.tostring(root, encoding='unicode', method='xml')
                     extracted_data = xml_text  # Store as string for now
                     notes.append("Parsed as XML (text representation)")

@@ -276,17 +283,14 @@ class EnhancedURLProcessor:
                 logger.debug(f"Unknown content type '{content_type}' from {base_url}. Storing raw content.")
                 extracted_data = content  # Store raw content for unknown types
                 notes.append(f"Unknown content type '{content_type}'. Stored raw text.")
-
         except Exception as e:
             logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}")
             extracted_data = content  # Fallback to raw content on error
             notes.append(f"Unexpected processing error: {e}. Stored raw text.")
-
         return {'data': extracted_data, 'notes': notes}

-
     def _process_html_content_enhanced(self, content: str, base_url: str) -> Dict[str, Any]:
-        """Process HTML content, preserving text, and extracting metadata."""
+        """Process HTML content, preserving text, and extracting metadata and links."""
         extracted: Dict[str, Any] = {
             'title': None,
             'meta_description': None,  # Add extraction for meta description

@@ -306,23 +310,33 @@ class EnhancedURLProcessor:
                 extracted['meta_description'] = meta_desc['content'].strip()

             # Extract and process links (convert relative to absolute)
+            # Use a set to avoid duplicate URLs in the links list
+            unique_links = set()
             for a_tag in soup.find_all('a', href=True):
-                href = a_tag['href']
-
-
-
-
-
+                href = a_tag['href'].strip()
+                if href and not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')):  # Basic filter
+                    text = a_tag.get_text().strip()
+                    try:
+                        absolute_url = urljoin(base_url, href)
+                        if absolute_url not in unique_links:
+                            extracted['links'].append({'text': text, 'url': absolute_url})
+                            unique_links.add(absolute_url)
+                    except Exception:
+                        # If urljoin fails, keep the original href if it looks like a valid potential URL part
+                        if validators.url(href) and href not in unique_links:
+                            extracted['links'].append({'text': text, 'url': href})
+                            unique_links.add(href)
+                        elif urlparse(href).netloc and href not in unique_links:  # Maybe just a domain/path?
+                            extracted['links'].append({'text': text, 'url': href})
+                            unique_links.add(href)


             # Extract all text content (similar to stripped_strings but ensures order)
-            text_parts = []
             # Use a more robust way to get visible text, including handling script/style tags
-
+            soup_copy = BeautifulSoup(content, 'html.parser')  # Work on a copy to preserve soup for links
+            for script_or_style in soup_copy(["script", "style"]):
                 script_or_style.extract()  # Remove script and style tags
-            text =
-
+            text = soup_copy.get_text(separator='\n')  # Get text with newlines
             # Clean up whitespace and empty lines
             lines = text.splitlines()
             cleaned_lines = [line.strip() for line in lines if line.strip()]

@@ -330,11 +344,189 @@ class EnhancedURLProcessor:

         except Exception as e:
             logger.error(f"Enhanced HTML processing error for {base_url}: {e}")
-
+            # Fallback: Store raw text and indicate error
+            soup_copy = BeautifulSoup(content, 'html.parser')
+            for script_or_style in soup_copy(["script", "style"]):
+                script_or_style.extract()
+            extracted['full_text'] = soup_copy.get_text(separator='\n').strip()
             extracted['processing_error'] = f"Enhanced HTML processing failed: {e}"

         return extracted

+    def fetch_content_with_depth(self, url: str, max_steps: int = 0) -> Dict[str, Any]:
+        """
+        Fetches content from a URL and recursively follows links up to a specified depth.
+
+        Args:
+            url: The initial URL to fetch.
+            max_steps: The maximum number of levels to follow links (0-3).
+                0: Only fetch the initial URL.
+                1: Fetch the initial URL and the links found on that page.
+                2: Fetch the initial URL, its links, and the links on those pages.
+                3: Fetch up to the third level of links.
+
+        Returns:
+            A dictionary containing the extraction result for the initial URL and
+            nested results for followed links.
+        """
+        if not isinstance(max_steps, int) or not (0 <= max_steps <= 3):
+            logger.error(f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3.")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Invalid max_steps value: {max_steps}. Must be an integer between 0 and 3."
+            }
+
+        validation_result = self.validate_url(url)
+        if not validation_result['is_valid']:
+            logger.error(f"Initial URL validation failed for {url}: {validation_result['message']}")
+            return {
+                'url': url,
+                'level': 0,
+                'fetch_result': None,
+                'linked_extractions': [],
+                'note': f"Initial URL validation failed: {validation_result['message']}"
+            }
+
+
+        return self._fetch_content_recursive(url, max_steps, current_step=0)
+
+    def _fetch_content_recursive(self, url: str, max_steps: int, current_step: int) -> Dict[str, Any]:
+        """Recursive helper to fetch content and follow links."""
+
+        if current_step > max_steps:
+            logger.debug(f"Depth limit reached for {url} at level {current_step}.")
+            return {
+                'url': url,
+                'level': current_step,
+                'fetch_result': None,  # Indicate no fetch happened at this level
+                'linked_extractions': [],
+                'note': f"Depth limit ({max_steps}) reached."
+            }
+
+        logger.info(f"Processing URL: {url} at level {current_step}/{max_steps}")
+
+        # Fetch content for the current URL
+        fetch_result = self.fetch_content(url)
+
+        linked_extractions: List[Dict[str, Any]] = []
+
+        # Only follow links if fetch was successful, content is HTML, and within depth limit
+        if fetch_result and fetch_result.get('extracted_data') and 'text/html' in fetch_result.get('metadata', {}).get('content_type', '').lower():
+            extracted_data = fetch_result['extracted_data']
+            links = extracted_data.get('links', [])  # Ensure links is a list even if missing
+
+            logger.info(f"Found {len(links)} links on {url} at level {current_step}. Proceeding to depth {current_step + 1}.")
+
+            # Recursively fetch linked content if not at max depth
+            if current_step < max_steps:
+                for link_info in links:
+                    linked_url = link_info.get('url')
+                    if linked_url:
+                        # Simple check to avoid re-fetching the same URL repeatedly in a chain
+                        # More sophisticated cycle detection might be needed for complex graphs
+                        if linked_url != urlparse(url)._replace(fragment='').geturl():  # Avoid self-referencing links ignoring fragment
+                            # Recursively call for the linked URL
+                            linked_result = self._fetch_content_recursive(linked_url, max_steps, current_step + 1)
+                            linked_extractions.append(linked_result)
+                        else:
+                            logger.debug(f"Skipping self-referencing link: {linked_url}")
+                            linked_extractions.append({
+                                'url': linked_url,
+                                'level': current_step + 1,
+                                'fetch_result': None,
+                                'linked_extractions': [],
+                                'note': 'Skipped self-referencing link'
+                            })
+                    else:
+                        linked_extractions.append({
+                            'url': 'Invalid or missing link',
+                            'level': current_step + 1,
+                            'fetch_result': None,
+                            'linked_extractions': [],
+                            'note': 'Link URL not found or invalid'
+                        })
+            else:
+                logger.info(f"Max depth ({max_steps}) reached. Not following links from {url}.")
+
+
+        return {
+            'url': url,
+            'level': current_step,
+            'fetch_result': fetch_result,
+            'linked_extractions': linked_extractions,
+            'note': f"Processed at level {current_step}"
+        }
+
+# --- Example Usage ---
+if __name__ == "__main__":
+    processor = EnhancedURLProcessor()
+
+    # --- Test Cases ---
+
+    # Test with 0 steps (only initial URL)
+    print("\n--- Testing with max_steps = 0 ---")
+    result_0 = processor.fetch_content_with_depth("https://httpbin.org/html", max_steps=0)
+    # print(json.dumps(result_0, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_0['url']}) fetched at level {result_0['level']}. Success: {result_0['fetch_result'] is not None}")
+    print(f"Number of linked extractions: {len(result_0['linked_extractions'])}")  # Should be 0
+
+    # Test with 1 step (initial URL + its direct links)
+    # Note: Replace with a real website URL that has internal links for meaningful testing
+    # For demonstration, using a placeholder. A real site like a blog post or news article front page is better.
+    test_url_with_links = "https://quotes.toscrape.com/"  # Example site with links
+    print(f"\n--- Testing with max_steps = 1 for {test_url_with_links} ---")
+    result_1 = processor.fetch_content_with_depth(test_url_with_links, max_steps=1)
+    # print(json.dumps(result_1, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_1['url']}) fetched at level {result_1['level']}. Success: {result_1['fetch_result'] is not None}")
+    print(f"Number of direct links found and processed: {len(result_1['linked_extractions'])}")
+    if result_1['linked_extractions']:
+        print(f"First linked URL processed at level 1: {result_1['linked_extractions'][0]['url']}")
+        print(f"Number of links found on the first linked page: {len(result_1['linked_extractions'][0]['linked_extractions'])}")  # Should be 0 for max_steps=1
+
+    # Test with 2 steps
+    print(f"\n--- Testing with max_steps = 2 for {test_url_with_links} ---")
+    result_2 = processor.fetch_content_with_depth(test_url_with_links, max_steps=2)
+    # print(json.dumps(result_2, indent=2))  # Uncomment to see full structure
+
+    print(f"Initial URL ({result_2['url']}) fetched at level {result_2['level']}. Success: {result_2['fetch_result'] is not None}")
+    print(f"Number of direct links found and processed (Level 1): {len(result_2['linked_extractions'])}")
+    if result_2['linked_extractions']:
+        print(f"First linked URL processed at level 1: {result_2['linked_extractions'][0]['url']}")
+        print(f"Number of links found on the first linked page and processed (Level 2): {len(result_2['linked_extractions'][0]['linked_extractions'])}")
+        if result_2['linked_extractions'][0]['linked_extractions']:
+            print(f"First level 2 linked URL: {result_2['linked_extractions'][0]['linked_extractions'][0]['url']}")
+            print(f"Number of links found on the first level 2 page: {len(result_2['linked_extractions'][0]['linked_extractions'][0]['linked_extractions'])}")  # Should be 0 for max_steps=2
+
+    # Test with max_steps = 3 (will go one level deeper than 2)
+    # print(f"\n--- Testing with max_steps = 3 for {test_url_with_links} ---")
+    # result_3 = processor.fetch_content_with_depth(test_url_with_links, max_steps=3)
+    # print(json.dumps(result_3, indent=2))  # Uncomment to see full structure
+    # Add similar print statements for result_3 to show levels 1, 2, and 3 counts
+
+    # Test with invalid max_steps
+    print("\n--- Testing with invalid max_steps = 4 ---")
+    result_invalid = processor.fetch_content_with_depth("https://example.com", max_steps=4)
+    print(f"Result for invalid steps: {result_invalid.get('note')}")
+
+    # Test with invalid initial URL
+    print("\n--- Testing with invalid initial URL ---")
+    result_invalid_url = processor.fetch_content_with_depth("invalid-url", max_steps=1)
+    print(f"Result for invalid initial URL: {result_invalid_url.get('note')}")
+
+    # Test with a URL that might fail to fetch
+    print("\n--- Testing with a potentially failing URL ---")
+    # Use a non-existent subdomain or a port that's unlikely to be open
+    failing_url = "http://this-domain-does-not-exist-12345.com/"
+    result_fail = processor.fetch_content_with_depth(failing_url, max_steps=1)
+    print(f"Result for failing URL: {result_fail.get('note')}")
+    if result_fail.get('fetch_result'):
+        print(f"Fetch result notes for failing URL: {result_fail['fetch_result'].get('processing_notes')}")
+
 class EnhancedFileProcessor:
     """Advanced file processing with enhanced content extraction"""
     def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default