Samuel Thomas committed · Commit d5fffa5
1 Parent(s): 9e30ca3
ddgo debug
tools.py CHANGED
@@ -785,6 +785,7 @@ class PythonExecutorTool(BaseTool):
         """Async version - delegates to sync implementation."""
         return self._run(file_path, run_manager)
 
+
 class EnhancedDuckDuckGoSearchTool(BaseTool):
     name: str = "enhanced_search"
     description: str = (
@@ -796,13 +797,10 @@ class EnhancedDuckDuckGoSearchTool(BaseTool):
     )
     max_results: int = 3
     max_chars_per_page: int = 12000
-    session: Any = None
-
+    session: Any = None
 
-    # Use model_post_init for initialization logic in Pydantic v2+
     def model_post_init(self, __context: Any) -> None:
         super().model_post_init(__context)
-        # Initialize HTTP session here
         self.session = requests.Session()
         self.session.headers.update({
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
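Pydantic v2 models (LangChain's BaseTool is one) run model_post_init after field validation, which is why the session setup belongs there rather than in __init__. A minimal sketch of the same pattern, using a hypothetical Fetcher model rather than the tool above:

    from typing import Any

    import requests
    from pydantic import BaseModel

    class Fetcher(BaseModel):
        # Declared as an ordinary field so Pydantic accepts the attribute;
        # the real value is attached after construction.
        session: Any = None

        def model_post_init(self, __context: Any) -> None:
            # Runs once all fields are validated: a safe place for runtime-only state.
            self.session = requests.Session()
            self.session.headers.update({"User-Agent": "example-agent/0.1"})

    fetcher = Fetcher()
    assert fetcher.session.headers["User-Agent"] == "example-agent/0.1"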
@@ -812,100 +810,70 @@
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1',
         })
-
-    def _search_duckduckgo(self, query: str) -> List[Dict]:
+
+    def _search_duckduckgo(self, query_term: str) -> List[Dict]:  # Renamed 'query' to 'query_term' for clarity
         """Perform DuckDuckGo search and return results."""
         try:
             with DDGS() as ddgs:
-                results = list(ddgs.text(query, max_results=self.max_results))
+                results = list(ddgs.text(query_term, max_results=self.max_results))
                 return results
         except Exception as e:
             logger.error(f"DuckDuckGo search failed: {e}")
             return []
-
+
     def _extract_content_from_url(self, url: str, timeout: int = 10) -> Optional[str]:
         """Extract clean text content from a web page."""
         try:
-            # Skip certain file types
             if any(url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']):
                 return "Content type not supported for extraction"
-
             response = self.session.get(url, timeout=timeout, allow_redirects=True)
             response.raise_for_status()
-
-            # Check content type
             content_type = response.headers.get('content-type', '').lower()
             if 'text/html' not in content_type:
                 return "Non-HTML content detected"
-
             soup = BeautifulSoup(response.content, 'html.parser')
-
-
-            for script in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
-                script.decompose()
-
-            # Try to find main content areas
+            for script_or_style in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
+                script_or_style.decompose()
             main_content = None
-            for selector in ['main', 'article', '.content', '#content', '.post', '.entry']:
+            for selector in ['main', 'article', '.content', '#content', '.post', '.entry-content', '.entry']:  # Added .entry-content
                 main_content = soup.select_one(selector)
                 if main_content:
                     break
-
             if not main_content:
                 main_content = soup.find('body') or soup
-
-            # Extract text
             text = main_content.get_text(separator='\n', strip=True)
-
-            # Clean up the text
             lines = [line.strip() for line in text.split('\n') if line.strip()]
             text = '\n'.join(lines)
-
-            # Remove excessive whitespace
             text = re.sub(r'\n{3,}', '\n\n', text)
             text = re.sub(r' {2,}', ' ', text)
-
-            # Truncate if too long
             if len(text) > self.max_chars_per_page:
                 text = text[:self.max_chars_per_page] + "\n[Content truncated...]"
-
             return text
-
         except requests.exceptions.Timeout:
+            logger.warning(f"Page loading timed out for {url}")
             return "Page loading timed out"
         except requests.exceptions.RequestException as e:
+            logger.warning(f"Failed to retrieve page {url}: {str(e)}")
             return f"Failed to retrieve page: {str(e)}"
         except Exception as e:
             logger.error(f"Content extraction failed for {url}: {e}")
             return "Failed to extract content from page"
-
+
     def _format_search_result(self, result: Dict, content: str) -> str:
         """Format a single search result with its content."""
         title = result.get('title', 'No title')
         url = result.get('href', 'No URL')
         snippet = result.get('body', 'No snippet')
-
-        formatted = f"""
-🔍 **{title}**
-URL: {url}
-Snippet: {snippet}
-
-📄 **Page Content:**
-{content}
----
-"""
-        return formatted
-
+        return f"🔍 **{title}**\nURL: {url}\nSnippet: {snippet}\n\n📄 **Page Content:**\n{content}\n---\n"
+
     def run(self, tool_input: Union[str, Dict]) -> str:
         query_str: Optional[str] = None
 
         if isinstance(tool_input, dict):
-            # Try common keys where the actual query string might be stored
             if "query" in tool_input and isinstance(tool_input["query"], str):
                 query_str = tool_input["query"]
             elif "input" in tool_input and isinstance(tool_input["input"], str):
                 query_str = tool_input["input"]
-            # Add more checks if other dictionary structures are possible
             else:
                 return "Invalid input: Dictionary received, but does not contain a recognizable string query under 'query' or 'input' keys."
         elif isinstance(tool_input, str):
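Together, _search_duckduckgo and _extract_content_from_url implement a search-then-scrape loop. A self-contained sketch of the same flow, assuming the duckduckgo_search, requests, and beautifulsoup4 packages (the function name and limits here are illustrative, not the tool's API):

    from typing import List, Tuple

    import requests
    from bs4 import BeautifulSoup
    from duckduckgo_search import DDGS

    def search_and_extract(query: str, max_results: int = 3) -> List[Tuple[str, str]]:
        """Return (title, page_text) pairs for the top DuckDuckGo hits."""
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=max_results))
        pages = []
        for hit in hits:
            url = hit.get("href", "")
            if not url:
                continue
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                continue  # skip unreachable pages instead of returning error strings
            soup = BeautifulSoup(response.content, "html.parser")
            for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
                tag.decompose()
            main = soup.select_one("main") or soup.select_one("article") or soup
            pages.append((hit.get("title", ""), main.get_text(separator="\n", strip=True)))
        return pages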
@@ -913,20 +881,20 @@ Snippet: {snippet}
         else:
             return f"Invalid input type: Expected a string or a dictionary, but got {type(tool_input).__name__}."
 
-        """Execute the enhanced search."""
-        if not query_str:
+        # The misplaced docstring """Execute the enhanced search.""" was removed from here.
+
+        # Use query_str consistently from now on
+        if not query_str or not query_str.strip():
             return "Please provide a search query."
 
-        query = query_str.strip()
-        logger.info(f"Searching for: {query}")
+        query_str = query_str.strip()  # Apply strip to query_str
+        logger.info(f"Searching for: {query_str}")  # Use query_str
 
-        #
-        search_results = self._search_duckduckgo(query)
+        search_results = self._search_duckduckgo(query_str)  # Use query_str
 
         if not search_results:
-            return f"No search results found for query: {query}"
+            return f"No search results found for query: {query_str}"  # Use query_str
 
-        # Process each result and extract content
         enhanced_results = []
         processed_count = 0
 
@@ -934,42 +902,36 @@ Snippet: {snippet}
             url = result.get('href', '')
             if not url:
                 continue
-
             logger.info(f"Processing result {i+1}: {url}")
-
-            # Extract content from the page
             content = self._extract_content_from_url(url)
-
-            if content and len(content.strip()) > 50:  # Only include results with substantial content
+            if content and len(content.strip()) > 50:
                 formatted_result = self._format_search_result(result, content)
                 enhanced_results.append(formatted_result)
                 processed_count += 1
-
-            # Small delay to be respectful to servers
-            time.sleep(0.5)
+            time.sleep(0.5)  # Consider making this configurable or adjusting based on use case
 
         if not enhanced_results:
-            return f"Search completed but no content could be extracted from the pages for query: {query}"
+            return f"Search completed but no content could be extracted from the pages for query: {query_str}"  # Use query_str
 
-
-        response = f"""🔍 **Enhanced Search Results for: "{query}"**
+        response = f"""🔍 **Enhanced Search Results for: "{query_str}"**
 Found {len(search_results)} results, successfully processed {processed_count} pages with content.
 
 {''.join(enhanced_results)}
 
 💡 **Summary:** Retrieved and processed content from {processed_count} web pages to provide comprehensive information about your search query.
-"""
+"""  # Use query_str
 
-        #
-        if len(response) > 12000:
+        if len(response) > 12000:  # This limit is arbitrary; consider if it should relate to self.max_chars_per_page
             response = response[:12000] + "\n[Response truncated to prevent memory issues]"
 
         return response
 
-    def _run(self, query: str) -> str:
-        """Required by BaseTool interface."""
-        return self.run(query)
-
+    def _run(self, query_or_tool_input: Union[str, Dict]) -> str:  # Updated to reflect run's input
+        """Required by BaseTool interface. Handles various input types."""
+        # This _run method now correctly passes the input to the run method,
+        # which is designed to handle both string and dictionary inputs.
+        return self.run(query_or_tool_input)
+
 # --- Agent State Definition ---
 class AgentState(TypedDict):
     messages: Annotated[List[AnyMessage], lambda x, y: x + y]
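With _run now delegating to run, the tool accepts both input shapes end to end. A usage sketch, assuming the class is importable from tools.py and network access is available (output depends on live search results):

    from tools import EnhancedDuckDuckGoSearchTool

    tool = EnhancedDuckDuckGoSearchTool()

    # Plain string query, as a human caller would pass it.
    print(tool.run("pydantic model_post_init"))

    # Dict form, as LangChain-style agents often pass tool arguments.
    print(tool.run({"query": "pydantic model_post_init"}))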