Samuel Thomas committed on
Commit
d5fffa5
Β·
1 Parent(s): 9e30ca3

ddgo debug

Browse files
Files changed (1) hide show
  1. tools.py +34 -72
tools.py CHANGED
@@ -785,6 +785,7 @@ class PythonExecutorTool(BaseTool):
785
  """Async version - delegates to sync implementation."""
786
  return self._run(file_path, run_manager)
787
 
 
788
  class EnhancedDuckDuckGoSearchTool(BaseTool):
789
  name: str = "enhanced_search"
790
  description: str = (
@@ -796,13 +797,10 @@ class EnhancedDuckDuckGoSearchTool(BaseTool):
796
  )
797
  max_results: int = 3
798
  max_chars_per_page: int = 12000
799
- session: Any = None # Now it's optional and defaults to None
800
-
801
 
802
- # Use model_post_init for initialization logic in Pydantic v2+
803
  def model_post_init(self, __context: Any) -> None:
804
  super().model_post_init(__context)
805
- # Initialize HTTP session here
806
  self.session = requests.Session()
807
  self.session.headers.update({
808
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -812,100 +810,70 @@ class EnhancedDuckDuckGoSearchTool(BaseTool):
812
  'Connection': 'keep-alive',
813
  'Upgrade-Insecure-Requests': '1',
814
  })
815
-
816
- def _search_duckduckgo(self, query: str) -> List[Dict]:
817
  """Perform DuckDuckGo search and return results."""
818
  try:
819
  with DDGS() as ddgs:
820
- results = list(ddgs.text(query, max_results=self.max_results))
821
  return results
822
  except Exception as e:
823
  logger.error(f"DuckDuckGo search failed: {e}")
824
  return []
825
-
826
  def _extract_content_from_url(self, url: str, timeout: int = 10) -> Optional[str]:
827
  """Extract clean text content from a web page."""
828
  try:
829
- # Skip certain file types
830
  if any(url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']):
831
  return "Content type not supported for extraction"
832
-
833
  response = self.session.get(url, timeout=timeout, allow_redirects=True)
834
  response.raise_for_status()
835
-
836
- # Check content type
837
  content_type = response.headers.get('content-type', '').lower()
838
  if 'text/html' not in content_type:
839
  return "Non-HTML content detected"
840
-
841
  soup = BeautifulSoup(response.content, 'html.parser')
842
-
843
- # Remove script and style elements
844
- for script in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
845
- script.decompose()
846
-
847
- # Try to find main content areas
848
  main_content = None
849
- for selector in ['main', 'article', '.content', '#content', '.post', '.entry']:
850
  main_content = soup.select_one(selector)
851
  if main_content:
852
  break
853
-
854
  if not main_content:
855
  main_content = soup.find('body') or soup
856
-
857
- # Extract text
858
  text = main_content.get_text(separator='\n', strip=True)
859
-
860
- # Clean up the text
861
  lines = [line.strip() for line in text.split('\n') if line.strip()]
862
  text = '\n'.join(lines)
863
-
864
- # Remove excessive whitespace
865
  text = re.sub(r'\n{3,}', '\n\n', text)
866
  text = re.sub(r' {2,}', ' ', text)
867
-
868
- # Truncate if too long
869
  if len(text) > self.max_chars_per_page:
870
  text = text[:self.max_chars_per_page] + "\n[Content truncated...]"
871
-
872
  return text
873
-
874
  except requests.exceptions.Timeout:
 
875
  return "Page loading timed out"
876
  except requests.exceptions.RequestException as e:
 
877
  return f"Failed to retrieve page: {str(e)}"
878
  except Exception as e:
879
  logger.error(f"Content extraction failed for {url}: {e}")
880
  return "Failed to extract content from page"
881
-
882
  def _format_search_result(self, result: Dict, content: str) -> str:
883
  """Format a single search result with its content."""
884
  title = result.get('title', 'No title')
885
  url = result.get('href', 'No URL')
886
  snippet = result.get('body', 'No snippet')
887
-
888
- formatted = f"""
889
- πŸ” **{title}**
890
- URL: {url}
891
- Snippet: {snippet}
892
-
893
- πŸ“„ **Page Content:**
894
- {content}
895
- ---
896
- """
897
- return formatted
898
-
899
  def run(self, tool_input: Union[str, Dict]) -> str:
900
  query_str: Optional[str] = None
901
 
902
  if isinstance(tool_input, dict):
903
- # Try common keys where the actual query string might be stored
904
  if "query" in tool_input and isinstance(tool_input["query"], str):
905
  query_str = tool_input["query"]
906
  elif "input" in tool_input and isinstance(tool_input["input"], str):
907
  query_str = tool_input["input"]
908
- # Add more checks if other dictionary structures are possible
909
  else:
910
  return "Invalid input: Dictionary received, but does not contain a recognizable string query under 'query' or 'input' keys."
911
  elif isinstance(tool_input, str):
@@ -913,20 +881,20 @@ Snippet: {snippet}
913
  else:
914
  return f"Invalid input type: Expected a string or a dictionary, but got {type(tool_input).__name__}."
915
 
916
- """Execute the enhanced search."""
917
- if not query or not query.strip():
 
 
918
  return "Please provide a search query."
919
 
920
- query = query.strip()
921
- logger.info(f"Searching for: {query}")
922
 
923
- # Perform DuckDuckGo search
924
- search_results = self._search_duckduckgo(query)
925
 
926
  if not search_results:
927
- return f"No search results found for query: {query}"
928
 
929
- # Process each result and extract content
930
  enhanced_results = []
931
  processed_count = 0
932
 
@@ -934,42 +902,36 @@ Snippet: {snippet}
934
  url = result.get('href', '')
935
  if not url:
936
  continue
937
-
938
  logger.info(f"Processing result {i+1}: {url}")
939
-
940
- # Extract content from the page
941
  content = self._extract_content_from_url(url)
942
-
943
- if content and len(content.strip()) > 50: # Only include results with substantial content
944
  formatted_result = self._format_search_result(result, content)
945
  enhanced_results.append(formatted_result)
946
  processed_count += 1
947
-
948
- # Small delay to be respectful to servers
949
- time.sleep(0.5)
950
 
951
  if not enhanced_results:
952
- return f"Search completed but no content could be extracted from the pages for query: {query}"
953
 
954
- # Compile final response
955
- response = f"""πŸ” **Enhanced Search Results for: "{query}"**
956
  Found {len(search_results)} results, successfully processed {processed_count} pages with content.
957
 
958
  {''.join(enhanced_results)}
959
 
960
  πŸ’‘ **Summary:** Retrieved and processed content from {processed_count} web pages to provide comprehensive information about your search query.
961
- """
962
 
963
- # Ensure the response isn't too long
964
- if len(response) > 12000:
965
  response = response[:12000] + "\n[Response truncated to prevent memory issues]"
966
 
967
  return response
968
 
969
- def _run(self, query: str) -> str:
970
- """Required by BaseTool interface."""
971
- return self.run(query)
972
-
 
 
973
  # --- Agent State Definition ---
974
  class AgentState(TypedDict):
975
  messages: Annotated[List[AnyMessage], lambda x, y: x + y]
 
785
  """Async version - delegates to sync implementation."""
786
  return self._run(file_path, run_manager)
787
 
788
+
789
  class EnhancedDuckDuckGoSearchTool(BaseTool):
790
  name: str = "enhanced_search"
791
  description: str = (
 
797
  )
798
  max_results: int = 3
799
  max_chars_per_page: int = 12000
800
+ session: Any = None
 
801
 
 
802
  def model_post_init(self, __context: Any) -> None:
803
  super().model_post_init(__context)
 
804
  self.session = requests.Session()
805
  self.session.headers.update({
806
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
 
810
  'Connection': 'keep-alive',
811
  'Upgrade-Insecure-Requests': '1',
812
  })
813
+
814
+ def _search_duckduckgo(self, query_term: str) -> List[Dict]: # Renamed 'query' to 'query_term' for clarity
815
  """Perform DuckDuckGo search and return results."""
816
  try:
817
  with DDGS() as ddgs:
818
+ results = list(ddgs.text(query_term, max_results=self.max_results))
819
  return results
820
  except Exception as e:
821
  logger.error(f"DuckDuckGo search failed: {e}")
822
  return []
823
+
824
  def _extract_content_from_url(self, url: str, timeout: int = 10) -> Optional[str]:
825
  """Extract clean text content from a web page."""
826
  try:
 
827
  if any(url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']):
828
  return "Content type not supported for extraction"
 
829
  response = self.session.get(url, timeout=timeout, allow_redirects=True)
830
  response.raise_for_status()
 
 
831
  content_type = response.headers.get('content-type', '').lower()
832
  if 'text/html' not in content_type:
833
  return "Non-HTML content detected"
 
834
  soup = BeautifulSoup(response.content, 'html.parser')
835
+ for script_or_style in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
836
+ script_or_style.decompose()
 
 
 
 
837
  main_content = None
838
+ for selector in ['main', 'article', '.content', '#content', '.post', '.entry-content', '.entry']: # Added .entry-content
839
  main_content = soup.select_one(selector)
840
  if main_content:
841
  break
 
842
  if not main_content:
843
  main_content = soup.find('body') or soup
 
 
844
  text = main_content.get_text(separator='\n', strip=True)
 
 
845
  lines = [line.strip() for line in text.split('\n') if line.strip()]
846
  text = '\n'.join(lines)
 
 
847
  text = re.sub(r'\n{3,}', '\n\n', text)
848
  text = re.sub(r' {2,}', ' ', text)
 
 
849
  if len(text) > self.max_chars_per_page:
850
  text = text[:self.max_chars_per_page] + "\n[Content truncated...]"
 
851
  return text
 
852
  except requests.exceptions.Timeout:
853
+ logger.warning(f"Page loading timed out for {url}")
854
  return "Page loading timed out"
855
  except requests.exceptions.RequestException as e:
856
+ logger.warning(f"Failed to retrieve page {url}: {str(e)}")
857
  return f"Failed to retrieve page: {str(e)}"
858
  except Exception as e:
859
  logger.error(f"Content extraction failed for {url}: {e}")
860
  return "Failed to extract content from page"
861
+
862
  def _format_search_result(self, result: Dict, content: str) -> str:
863
  """Format a single search result with its content."""
864
  title = result.get('title', 'No title')
865
  url = result.get('href', 'No URL')
866
  snippet = result.get('body', 'No snippet')
867
+ return f"πŸ” **{title}**\nURL: {url}\nSnippet: {snippet}\n\nπŸ“„ **Page Content:**\n{content}\n---\n"
868
+
 
 
 
 
 
 
 
 
 
 
869
  def run(self, tool_input: Union[str, Dict]) -> str:
870
  query_str: Optional[str] = None
871
 
872
  if isinstance(tool_input, dict):
 
873
  if "query" in tool_input and isinstance(tool_input["query"], str):
874
  query_str = tool_input["query"]
875
  elif "input" in tool_input and isinstance(tool_input["input"], str):
876
  query_str = tool_input["input"]
 
877
  else:
878
  return "Invalid input: Dictionary received, but does not contain a recognizable string query under 'query' or 'input' keys."
879
  elif isinstance(tool_input, str):
 
881
  else:
882
  return f"Invalid input type: Expected a string or a dictionary, but got {type(tool_input).__name__}."
883
 
884
+ # The misplaced docstring """Execute the enhanced search.""" was removed from here.
885
+
886
+ # Use query_str consistently from now on
887
+ if not query_str or not query_str.strip():
888
  return "Please provide a search query."
889
 
890
+ query_str = query_str.strip() # Apply strip to query_str
891
+ logger.info(f"Searching for: {query_str}") # Use query_str
892
 
893
+ search_results = self._search_duckduckgo(query_str) # Use query_str
 
894
 
895
  if not search_results:
896
+ return f"No search results found for query: {query_str}" # Use query_str
897
 
 
898
  enhanced_results = []
899
  processed_count = 0
900
 
 
902
  url = result.get('href', '')
903
  if not url:
904
  continue
 
905
  logger.info(f"Processing result {i+1}: {url}")
 
 
906
  content = self._extract_content_from_url(url)
907
+ if content and len(content.strip()) > 50:
 
908
  formatted_result = self._format_search_result(result, content)
909
  enhanced_results.append(formatted_result)
910
  processed_count += 1
911
+ time.sleep(0.5) # Consider making this configurable or adjusting based on use case
 
 
912
 
913
  if not enhanced_results:
914
+ return f"Search completed but no content could be extracted from the pages for query: {query_str}" # Use query_str
915
 
916
+ response = f"""πŸ” **Enhanced Search Results for: "{query_str}"**
 
917
  Found {len(search_results)} results, successfully processed {processed_count} pages with content.
918
 
919
  {''.join(enhanced_results)}
920
 
921
  πŸ’‘ **Summary:** Retrieved and processed content from {processed_count} web pages to provide comprehensive information about your search query.
922
+ """ # Use query_str
923
 
924
+ if len(response) > 12000: # This limit is arbitrary; consider if it should relate to self.max_chars_per_page
 
925
  response = response[:12000] + "\n[Response truncated to prevent memory issues]"
926
 
927
  return response
928
 
929
+ def _run(self, query_or_tool_input: Union[str, Dict]) -> str: # Updated to reflect run's input
930
+ """Required by BaseTool interface. Handles various input types."""
931
+ # This _run method now correctly passes the input to the run method,
932
+ # which is designed to handle both string and dictionary inputs.
933
+ return self.run(query_or_tool_input)
934
+
935
  # --- Agent State Definition ---
936
  class AgentState(TypedDict):
937
  messages: Annotated[List[AnyMessage], lambda x, y: x + y]