Samuel Thomas committed · Commit d5fffa5
1 Parent(s): 9e30ca3
ddgo debug
tools.py CHANGED
@@ -785,6 +785,7 @@ class PythonExecutorTool(BaseTool):
         """Async version - delegates to sync implementation."""
         return self._run(file_path, run_manager)
 
+
 class EnhancedDuckDuckGoSearchTool(BaseTool):
     name: str = "enhanced_search"
     description: str = (
@@ -796,13 +797,10 @@ class EnhancedDuckDuckGoSearchTool(BaseTool):
     )
     max_results: int = 3
     max_chars_per_page: int = 12000
-    session: Any = None
-
+    session: Any = None
 
-    # Use model_post_init for initialization logic in Pydantic v2+
     def model_post_init(self, __context: Any) -> None:
         super().model_post_init(__context)
-        # Initialize HTTP session here
         self.session = requests.Session()
         self.session.headers.update({
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
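Pydantic v2 models (LangChain's BaseTool is one) run model_post_init after field validation, which is why the session setup belongs there rather than in __init__. A minimal sketch of the same pattern, using a hypothetical Fetcher model rather than the tool above:

    from typing import Any

    import requests
    from pydantic import BaseModel

    class Fetcher(BaseModel):
        # Declared as an ordinary field so Pydantic accepts the attribute;
        # the real value is attached after construction.
        session: Any = None

        def model_post_init(self, __context: Any) -> None:
            # Runs once all fields are validated: a safe place for runtime-only state.
            self.session = requests.Session()
            self.session.headers.update({"User-Agent": "example-agent/0.1"})

    fetcher = Fetcher()
    assert fetcher.session.headers["User-Agent"] == "example-agent/0.1"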
@@ -812,100 +810,70 @@
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1',
         })
-
-    def _search_duckduckgo(self, query: str) -> List[Dict]:
+
+    def _search_duckduckgo(self, query_term: str) -> List[Dict]:  # Renamed 'query' to 'query_term' for clarity
         """Perform DuckDuckGo search and return results."""
         try:
             with DDGS() as ddgs:
-                results = list(ddgs.text(query, max_results=self.max_results))
+                results = list(ddgs.text(query_term, max_results=self.max_results))
                 return results
         except Exception as e:
             logger.error(f"DuckDuckGo search failed: {e}")
             return []
-
+
     def _extract_content_from_url(self, url: str, timeout: int = 10) -> Optional[str]:
         """Extract clean text content from a web page."""
         try:
-            # Skip certain file types
             if any(url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']):
                 return "Content type not supported for extraction"
-
             response = self.session.get(url, timeout=timeout, allow_redirects=True)
             response.raise_for_status()
-
-            # Check content type
             content_type = response.headers.get('content-type', '').lower()
             if 'text/html' not in content_type:
                 return "Non-HTML content detected"
-
             soup = BeautifulSoup(response.content, 'html.parser')
-
-
-            for script in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
-                script.decompose()
-
-            # Try to find main content areas
+            for script_or_style in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
+                script_or_style.decompose()
             main_content = None
-            for selector in ['main', 'article', '.content', '#content', '.post', '.entry']:
+            for selector in ['main', 'article', '.content', '#content', '.post', '.entry-content', '.entry']:  # Added .entry-content
                 main_content = soup.select_one(selector)
                 if main_content:
                     break
-
             if not main_content:
                 main_content = soup.find('body') or soup
-
-            # Extract text
             text = main_content.get_text(separator='\n', strip=True)
-
-            # Clean up the text
             lines = [line.strip() for line in text.split('\n') if line.strip()]
             text = '\n'.join(lines)
-
-            # Remove excessive whitespace
             text = re.sub(r'\n{3,}', '\n\n', text)
             text = re.sub(r' {2,}', ' ', text)
-
-            # Truncate if too long
             if len(text) > self.max_chars_per_page:
                 text = text[:self.max_chars_per_page] + "\n[Content truncated...]"
-
             return text
-
         except requests.exceptions.Timeout:
+            logger.warning(f"Page loading timed out for {url}")
             return "Page loading timed out"
         except requests.exceptions.RequestException as e:
+            logger.warning(f"Failed to retrieve page {url}: {str(e)}")
             return f"Failed to retrieve page: {str(e)}"
         except Exception as e:
             logger.error(f"Content extraction failed for {url}: {e}")
             return "Failed to extract content from page"
-
+
     def _format_search_result(self, result: Dict, content: str) -> str:
         """Format a single search result with its content."""
         title = result.get('title', 'No title')
         url = result.get('href', 'No URL')
         snippet = result.get('body', 'No snippet')
-
-        formatted = f"""
-🔍 **{title}**
-URL: {url}
-Snippet: {snippet}
-
-📄 **Page Content:**
-{content}
----
-"""
-        return formatted
-
+        return f"🔍 **{title}**\nURL: {url}\nSnippet: {snippet}\n\n📄 **Page Content:**\n{content}\n---\n"
+
     def run(self, tool_input: Union[str, Dict]) -> str:
         query_str: Optional[str] = None
 
         if isinstance(tool_input, dict):
-            # Try common keys where the actual query string might be stored
             if "query" in tool_input and isinstance(tool_input["query"], str):
                 query_str = tool_input["query"]
             elif "input" in tool_input and isinstance(tool_input["input"], str):
                 query_str = tool_input["input"]
-            # Add more checks if other dictionary structures are possible
             else:
                 return "Invalid input: Dictionary received, but does not contain a recognizable string query under 'query' or 'input' keys."
         elif isinstance(tool_input, str):
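Together, _search_duckduckgo and _extract_content_from_url implement a search-then-scrape loop. A self-contained sketch of the same flow, assuming the duckduckgo_search, requests, and beautifulsoup4 packages (the function name and limits here are illustrative, not the tool's API):

    from typing import List, Tuple

    import requests
    from bs4 import BeautifulSoup
    from duckduckgo_search import DDGS

    def search_and_extract(query: str, max_results: int = 3) -> List[Tuple[str, str]]:
        """Return (title, page_text) pairs for the top DuckDuckGo hits."""
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=max_results))
        pages = []
        for hit in hits:
            url = hit.get("href", "")
            if not url:
                continue
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                continue  # skip unreachable pages instead of returning error strings
            soup = BeautifulSoup(response.content, "html.parser")
            for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
                tag.decompose()
            main = soup.select_one("main") or soup.select_one("article") or soup
            pages.append((hit.get("title", ""), main.get_text(separator="\n", strip=True)))
        return pages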
@@ -913,20 +881,20 @@ Snippet: {snippet}
         else:
             return f"Invalid input type: Expected a string or a dictionary, but got {type(tool_input).__name__}."
 
-        """Execute the enhanced search."""
-        if not query_str:
+        # The misplaced docstring """Execute the enhanced search.""" was removed from here.
+
+        # Use query_str consistently from now on
+        if not query_str or not query_str.strip():
             return "Please provide a search query."
 
-        query = query_str.strip()
-        logger.info(f"Searching for: {query}")
+        query_str = query_str.strip()  # Apply strip to query_str
+        logger.info(f"Searching for: {query_str}")  # Use query_str
 
-        #
-        search_results = self._search_duckduckgo(query)
+        search_results = self._search_duckduckgo(query_str)  # Use query_str
 
         if not search_results:
-            return f"No search results found for query: {query}"
+            return f"No search results found for query: {query_str}"  # Use query_str
 
-        # Process each result and extract content
         enhanced_results = []
         processed_count = 0
 
@@ -934,42 +902,36 @@ Snippet: {snippet}
             url = result.get('href', '')
             if not url:
                 continue
-
             logger.info(f"Processing result {i+1}: {url}")
-
-            # Extract content from the page
             content = self._extract_content_from_url(url)
-
-            if content and len(content.strip()) > 50:  # Only include results with substantial content
+            if content and len(content.strip()) > 50:
                 formatted_result = self._format_search_result(result, content)
                 enhanced_results.append(formatted_result)
                 processed_count += 1
-
-            # Small delay to be respectful to servers
-            time.sleep(0.5)
+            time.sleep(0.5)  # Consider making this configurable or adjusting based on use case
 
         if not enhanced_results:
-            return f"Search completed but no content could be extracted from the pages for query: {query}"
+            return f"Search completed but no content could be extracted from the pages for query: {query_str}"  # Use query_str
 
-
-        response = f"""🔍 **Enhanced Search Results for: "{query}"**
+        response = f"""🔍 **Enhanced Search Results for: "{query_str}"**
 Found {len(search_results)} results, successfully processed {processed_count} pages with content.
 
 {''.join(enhanced_results)}
 
 💡 **Summary:** Retrieved and processed content from {processed_count} web pages to provide comprehensive information about your search query.
-"""
+"""  # Use query_str
 
-        #
-        if len(response) > 12000:
+        if len(response) > 12000:  # This limit is arbitrary; consider if it should relate to self.max_chars_per_page
             response = response[:12000] + "\n[Response truncated to prevent memory issues]"
 
         return response
 
-    def _run(self, query: str) -> str:
-        """Required by BaseTool interface."""
-        return self.run(query)
-
+    def _run(self, query_or_tool_input: Union[str, Dict]) -> str:  # Updated to reflect run's input
+        """Required by BaseTool interface. Handles various input types."""
+        # This _run method now correctly passes the input to the run method,
+        # which is designed to handle both string and dictionary inputs.
+        return self.run(query_or_tool_input)
+
 # --- Agent State Definition ---
 class AgentState(TypedDict):
     messages: Annotated[List[AnyMessage], lambda x, y: x + y]
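With _run now delegating to run, the tool accepts both input shapes end to end. A usage sketch, assuming the class is importable from tools.py and network access is available (output depends on live search results):

    from tools import EnhancedDuckDuckGoSearchTool

    tool = EnhancedDuckDuckGoSearchTool()

    # Plain string query, as a human caller would pass it.
    print(tool.run("pydantic model_post_init"))

    # Dict form, as LangChain-style agents often pass tool arguments.
    print(tool.run({"query": "pydantic model_post_init"}))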