Denis Davydov committed
Commit · 7f6ab11
1 Parent(s): 913cae8
search also in general

tools.py CHANGED
Before (parent 913cae8); removed lines are prefixed with -, several of them truncated in the rendered diff:

@@ -143,60 +143,63 @@ text_processor_tool = Tool(
     description="Processes text for various operations like summarization, number extraction, date extraction. Specify operation as second parameter."
 )
 
-def enhanced_web_retrieval_tool_func(query: str) -> str:
-    """Enhanced web search with
     try:
         print(f"Enhanced web retrieval for: {query}")
 
-        # Step 1:
-
-
-        return "No search results found."
 
-
-
-
-
-
-
-
-
-
-            doc = Document(
-                page_content=content,
-                metadata={"source": url, "title": title}
-            )
-            documents.append(doc)
 
-        if
-
 
-
-        return search_documents_with_vector_store(documents, query)
 
     except Exception as e:
         return f"Enhanced web retrieval failed: {str(e)}"
 
-def
-    """Get search results from English Wikipedia
     try:
         with DDGS() as ddgs:
             # Create Wikipedia-specific search queries
-
                 f"{query} site:en.wikipedia.org"
             ]
 
             search_results = []
             seen_urls = set()
 
-            for wiki_query in
                 try:
-                    results = list(ddgs.text(
 
                     for result in results:
                         url = result.get('href', '')
 
-
                             search_results.append({
                                 'url': url,
                                 'title': result.get('title', 'No title'),
@@ -204,15 +207,16 @@ def get_search_urls(query: str) -> list:
                             })
                             seen_urls.add(url)
 
-                    # Limit to
-                    if len(search_results) >=
                                 break
 
-            if len(search_results) >=
                         break
 
-                except Exception:
-
 
             return search_results
 
@@ -220,6 +224,106 @@ def get_search_urls(query: str) -> list:
         print(f"Wikipedia search URL retrieval failed: {e}")
         return []
 
 def fetch_webpage_content(url: str) -> str:
     """Fetch and extract clean text content from a webpage."""
     try:
@@ -251,7 +355,7 @@ def fetch_webpage_content(url: str) -> str:
         print(f"Failed to fetch content from {url}: {e}")
         return ""
 
-def search_documents_with_vector_store(documents: list, query: str) -> str:
     """Create vector store and search for relevant information."""
     try:
         # Split documents into chunks
@@ -273,14 +377,17 @@ def search_documents_with_vector_store(documents: list, query: str) -> str:
         # Search for relevant chunks with the original query
         relevant_docs = vectorstore.similarity_search(query, k=5)
 
-        # Format results
         results = []
         for i, doc in enumerate(relevant_docs, 1):
             source = doc.metadata.get('source', 'Unknown source')
             title = doc.metadata.get('title', 'No title')
-
 
-            results.append(f"Result {i}
 
         return "\n---\n".join(results)
 
@@ -290,7 +397,7 @@ def search_documents_with_vector_store(documents: list, query: str) -> str:
 web_search_tool = Tool(
     name="enhanced_web_retrieval",
     func=enhanced_web_retrieval_tool_func,
-    description="Enhanced
 )
 
 # List of all tools for easy import
After (commit 7f6ab11); added lines are prefixed with +:

@@ -143,60 +143,63 @@ text_processor_tool = Tool(
     description="Processes text for various operations like summarization, number extraction, date extraction. Specify operation as second parameter."
 )
 
+def enhanced_web_retrieval_tool_func(query: str, backend: str = "bing") -> str:
+    """Enhanced web search with cascading fallback: Wikipedia first, then general web search."""
     try:
         print(f"Enhanced web retrieval for: {query}")
 
+        # Step 1: Try Wikipedia search first
+        print("Searching Wikipedia...")
+        wikipedia_results = get_wikipedia_search_urls(query, backend)
 
+        if has_sufficient_results(wikipedia_results):
+            print(f"Found {len(wikipedia_results)} Wikipedia results")
+            documents = fetch_and_process_results(wikipedia_results, "Wikipedia")
+            if documents:
+                return search_documents_with_vector_store(documents, query, "Wikipedia")
+
+        # Step 2: Fallback to general web search
+        print("Wikipedia results insufficient, searching general web...")
+        web_results = get_general_web_search_urls(query, backend)
 
+        if web_results:
+            print(f"Found {len(web_results)} general web results")
+            documents = fetch_and_process_results(web_results, "General Web")
+            if documents:
+                return search_documents_with_vector_store(documents, query, "General Web")
 
+        return "No sufficient results found in Wikipedia or general web search."
 
     except Exception as e:
         return f"Enhanced web retrieval failed: {str(e)}"
 
+def get_wikipedia_search_urls(query: str, backend: str = "auto") -> list:
+    """Get search results from English Wikipedia using DDGS."""
     try:
         with DDGS() as ddgs:
             # Create Wikipedia-specific search queries
+            wikipedia_queries = [
                 f"{query} site:en.wikipedia.org"
             ]
 
             search_results = []
             seen_urls = set()
 
+            for wiki_query in wikipedia_queries:
                 try:
+                    results = list(ddgs.text(
+                        wiki_query,
+                        max_results=8,
+                        region="us-en",
+                        backend=backend,
+                        safesearch="moderate"
+                    ))
 
                     for result in results:
                         url = result.get('href', '')
 
+                        # Only include Wikipedia URLs and avoid duplicates
+                        if 'en.wikipedia.org' in url and url not in seen_urls:
                             search_results.append({
                                 'url': url,
                                 'title': result.get('title', 'No title'),
@@ -204,15 +207,16 @@ def get_search_urls(query: str) -> list:
                             })
                             seen_urls.add(url)
 
+                            # Limit to 6 unique Wikipedia pages
+                            if len(search_results) >= 6:
                                 break
 
+                    if len(search_results) >= 6:
                         break
 
+                except Exception as e:
+                    print(f"Wikipedia search attempt failed: {e}")
+                    continue
 
             return search_results
 
@@ -220,6 +224,106 @@ def get_search_urls(query: str) -> list:
         print(f"Wikipedia search URL retrieval failed: {e}")
         return []
 
+def get_general_web_search_urls(query: str, backend: str = "auto") -> list:
+    """Get search results from general web using DDGS."""
+    try:
+        with DDGS() as ddgs:
+            search_results = []
+            seen_urls = set()
+
+            try:
+                # General web search without site restriction
+                results = list(ddgs.text(
+                    query,
+                    max_results=8,
+                    region="us-en",
+                    backend=backend,
+                    safesearch="moderate"
+                ))
+
+                for result in results:
+                    url = result.get('href', '')
+
+                    # Avoid duplicates and filter out low-quality sources
+                    if url not in seen_urls and is_quality_source(url):
+                        search_results.append({
+                            'url': url,
+                            'title': result.get('title', 'No title'),
+                            'snippet': result.get('body', 'No content')
+                        })
+                        seen_urls.add(url)
+
+                        # Limit to 6 unique web pages
+                        if len(search_results) >= 6:
+                            break
+
+            except Exception as e:
+                print(f"General web search attempt failed: {e}")
+
+            return search_results
+
+    except Exception as e:
+        print(f"General web search URL retrieval failed: {e}")
+        return []
+
+def is_quality_source(url: str) -> bool:
+    """Filter out low-quality or problematic sources."""
+    low_quality_domains = [
+        'pinterest.com', 'instagram.com', 'facebook.com', 'twitter.com',
+        'tiktok.com', 'youtube.com', 'reddit.com'
+    ]
+
+    for domain in low_quality_domains:
+        if domain in url.lower():
+            return False
+
+    return True
+
+def has_sufficient_results(results: list) -> bool:
+    """Check if search results are sufficient to proceed."""
+    if not results:
+        return False
+
+    # Check for minimum number of results
+    if len(results) < 2:
+        return False
+
+    # Check if results have meaningful content
+    meaningful_results = 0
+    for result in results:
+        snippet = result.get('snippet', '')
+        title = result.get('title', '')
+
+        # Consider result meaningful if it has substantial content
+        if len(snippet) > 50 or len(title) > 10:
+            meaningful_results += 1
+
+    return meaningful_results >= 2
+
+def fetch_and_process_results(results: list, source_type: str) -> list:
+    """Fetch and process webpage content from search results."""
+    documents = []
+
+    for result in results[:4]:  # Process top 4 results
+        url = result.get('url', '')
+        title = result.get('title', 'No title')
+
+        print(f"Fetching content from: {title}")
+        content = fetch_webpage_content(url)
+
+        if content and len(content.strip()) > 100:  # Ensure meaningful content
+            doc = Document(
+                page_content=content,
+                metadata={
+                    "source": url,
+                    "title": title,
+                    "source_type": source_type
+                }
+            )
+            documents.append(doc)
+
+    return documents
+
 def fetch_webpage_content(url: str) -> str:
     """Fetch and extract clean text content from a webpage."""
     try:
@@ -251,7 +355,7 @@ def fetch_webpage_content(url: str) -> str:
         print(f"Failed to fetch content from {url}: {e}")
         return ""
 
+def search_documents_with_vector_store(documents: list, query: str, source_type: str = "Web") -> str:
     """Create vector store and search for relevant information."""
     try:
         # Split documents into chunks
@@ -273,14 +377,17 @@ def search_documents_with_vector_store(documents: list, query: str) -> str:
         # Search for relevant chunks with the original query
         relevant_docs = vectorstore.similarity_search(query, k=5)
 
+        # Format results with source type indication
         results = []
+        results.append(f"Search Results from {source_type}:\n")
+
         for i, doc in enumerate(relevant_docs, 1):
             source = doc.metadata.get('source', 'Unknown source')
             title = doc.metadata.get('title', 'No title')
+            source_type_meta = doc.metadata.get('source_type', source_type)
+            content = doc.page_content[:2000]  # Increased content length
 
+            results.append(f"Result {i} ({source_type_meta}) - {title}:\n{content}\nSource: {source}\n")
 
         return "\n---\n".join(results)
 
@@ -290,7 +397,7 @@ def search_documents_with_vector_store(documents: list, query: str) -> str:
 web_search_tool = Tool(
     name="enhanced_web_retrieval",
     func=enhanced_web_retrieval_tool_func,
+    description="Enhanced cascading web search with vector retrieval. First searches Wikipedia for reliable factual information, then falls back to general web search if insufficient results are found. Supports multiple search backends (auto, html, lite, bing) and uses semantic search to find relevant information. Ideal for comprehensive research on any topic."
 )
 
 # List of all tools for easy import
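For reference, a minimal usage sketch of the new cascading tool. It is not part of the commit; it assumes tools.py exposes enhanced_web_retrieval_tool_func and web_search_tool exactly as shown in the diff, that the module's dependencies (duckduckgo_search's DDGS, LangChain, and the embedding model used by search_documents_with_vector_store) are installed, and the query string is purely illustrative. Depending on the installed LangChain version, the wrapper is called with .run() or .invoke().

# Illustrative only: exercise the cascading retrieval path added in 7f6ab11.
from tools import enhanced_web_retrieval_tool_func, web_search_tool

query = "history of the Python programming language"  # example query, not from the repo

# Direct call: tries Wikipedia first, then falls back to the general web
# when has_sufficient_results() rejects the Wikipedia hits.
print(enhanced_web_retrieval_tool_func(query, backend="auto"))

# Same path through the LangChain Tool wrapper, as an agent would invoke it.
print(web_search_tool.run(query))

Note that the Tool wrapper forwards only the query string, so the backend argument is reachable only through the direct call and otherwise stays at its default of "bing".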