Denis Davydov committed
Commit 7f6ab11 · 1 Parent(s): 913cae8

search also in general

Files changed (1)
  1. tools.py +147 -40
tools.py CHANGED
@@ -143,60 +143,63 @@ text_processor_tool = Tool(
     description="Processes text for various operations like summarization, number extraction, date extraction. Specify operation as second parameter."
 )
 
-def enhanced_web_retrieval_tool_func(query: str) -> str:
-    """Enhanced web search with vector retrieval for deep content analysis."""
+def enhanced_web_retrieval_tool_func(query: str, backend: str = "bing") -> str:
+    """Enhanced web search with cascading fallback: Wikipedia first, then general web search."""
     try:
         print(f"🔍 Enhanced web retrieval for: {query}")
 
-        # Step 1: Get search results with URLs
-        search_results = get_search_urls(query)
-        if not search_results:
-            return "No search results found."
+        # Step 1: Try Wikipedia search first
+        print("📚 Searching Wikipedia...")
+        wikipedia_results = get_wikipedia_search_urls(query, backend)
 
-        # Step 2: Fetch and process webpage content
-        documents = []
-        for result in search_results[:4]:  # Top 4 results as requested
-            url = result.get('url', '')
-            title = result.get('title', 'No title')
-
-            print(f"📄 Fetching content from: {title}")
-            content = fetch_webpage_content(url)
-            if content:
-                doc = Document(
-                    page_content=content,
-                    metadata={"source": url, "title": title}
-                )
-                documents.append(doc)
+        if has_sufficient_results(wikipedia_results):
+            print(f"✅ Found {len(wikipedia_results)} Wikipedia results")
+            documents = fetch_and_process_results(wikipedia_results, "Wikipedia")
+            if documents:
+                return search_documents_with_vector_store(documents, query, "Wikipedia")
+
+        # Step 2: Fallback to general web search
+        print("🌐 Wikipedia results insufficient, searching general web...")
+        web_results = get_general_web_search_urls(query, backend)
 
-        if not documents:
-            return "Could not fetch content from any search results."
+        if web_results:
+            print(f"✅ Found {len(web_results)} general web results")
+            documents = fetch_and_process_results(web_results, "General Web")
+            if documents:
+                return search_documents_with_vector_store(documents, query, "General Web")
 
-        # Step 3: Create vector store and search
-        return search_documents_with_vector_store(documents, query)
+        return "No sufficient results found in Wikipedia or general web search."
 
     except Exception as e:
         return f"Enhanced web retrieval failed: {str(e)}"
 
-def get_search_urls(query: str) -> list:
-    """Get search results from English Wikipedia only using DDGS."""
+def get_wikipedia_search_urls(query: str, backend: str = "auto") -> list:
+    """Get search results from English Wikipedia using DDGS."""
     try:
         with DDGS() as ddgs:
             # Create Wikipedia-specific search queries
-            queries = [
+            wikipedia_queries = [
                 f"{query} site:en.wikipedia.org"
             ]
 
             search_results = []
             seen_urls = set()
 
-            for wiki_query in queries:
+            for wiki_query in wikipedia_queries:
                 try:
-                    results = list(ddgs.text(wiki_query, max_results=10, region="us-en", backend="bing", safesearch="on"))
+                    results = list(ddgs.text(
+                        wiki_query,
+                        max_results=8,
+                        region="us-en",
+                        backend=backend,
+                        safesearch="moderate"
+                    ))
 
                     for result in results:
                         url = result.get('href', '')
 
-                        if url not in seen_urls:
+                        # Only include Wikipedia URLs and avoid duplicates
+                        if 'en.wikipedia.org' in url and url not in seen_urls:
                             search_results.append({
                                 'url': url,
                                 'title': result.get('title', 'No title'),
@@ -204,15 +207,16 @@ def get_search_urls(query: str) -> list:
                             })
                             seen_urls.add(url)
 
-                            # Limit to 4 unique Wikipedia pages
-                            if len(search_results) >= 4:
+                            # Limit to 6 unique Wikipedia pages
+                            if len(search_results) >= 6:
                                 break
 
-                    if len(search_results) >= 4:
+                    if len(search_results) >= 6:
                         break
 
-                except Exception:
-                    continue  # Try next query
+                except Exception as e:
+                    print(f"Wikipedia search attempt failed: {e}")
+                    continue
 
         return search_results
 
@@ -220,6 +224,106 @@ def get_search_urls(query: str) -> list:
         print(f"Wikipedia search URL retrieval failed: {e}")
         return []
 
+def get_general_web_search_urls(query: str, backend: str = "auto") -> list:
+    """Get search results from general web using DDGS."""
+    try:
+        with DDGS() as ddgs:
+            search_results = []
+            seen_urls = set()
+
+            try:
+                # General web search without site restriction
+                results = list(ddgs.text(
+                    query,
+                    max_results=8,
+                    region="us-en",
+                    backend=backend,
+                    safesearch="moderate"
+                ))
+
+                for result in results:
+                    url = result.get('href', '')
+
+                    # Avoid duplicates and filter out low-quality sources
+                    if url not in seen_urls and is_quality_source(url):
+                        search_results.append({
+                            'url': url,
+                            'title': result.get('title', 'No title'),
+                            'snippet': result.get('body', 'No content')
+                        })
+                        seen_urls.add(url)
+
+                        # Limit to 6 unique web pages
+                        if len(search_results) >= 6:
+                            break
+
+            except Exception as e:
+                print(f"General web search attempt failed: {e}")
+
+            return search_results
+
+    except Exception as e:
+        print(f"General web search URL retrieval failed: {e}")
+        return []
+
+def is_quality_source(url: str) -> bool:
+    """Filter out low-quality or problematic sources."""
+    low_quality_domains = [
+        'pinterest.com', 'instagram.com', 'facebook.com', 'twitter.com',
+        'tiktok.com', 'youtube.com', 'reddit.com'
+    ]
+
+    for domain in low_quality_domains:
+        if domain in url.lower():
+            return False
+
+    return True
+
+def has_sufficient_results(results: list) -> bool:
+    """Check if search results are sufficient to proceed."""
+    if not results:
+        return False
+
+    # Check for minimum number of results
+    if len(results) < 2:
+        return False
+
+    # Check if results have meaningful content
+    meaningful_results = 0
+    for result in results:
+        snippet = result.get('snippet', '')
+        title = result.get('title', '')
+
+        # Consider result meaningful if it has substantial content
+        if len(snippet) > 50 or len(title) > 10:
+            meaningful_results += 1
+
+    return meaningful_results >= 2
+
+def fetch_and_process_results(results: list, source_type: str) -> list:
+    """Fetch and process webpage content from search results."""
+    documents = []
+
+    for result in results[:4]:  # Process top 4 results
+        url = result.get('url', '')
+        title = result.get('title', 'No title')
+
+        print(f"📄 Fetching content from: {title}")
+        content = fetch_webpage_content(url)
+
+        if content and len(content.strip()) > 100:  # Ensure meaningful content
+            doc = Document(
+                page_content=content,
+                metadata={
+                    "source": url,
+                    "title": title,
+                    "source_type": source_type
+                }
+            )
+            documents.append(doc)
+
+    return documents
+
 def fetch_webpage_content(url: str) -> str:
     """Fetch and extract clean text content from a webpage."""
     try:
@@ -251,7 +355,7 @@ def fetch_webpage_content(url: str) -> str:
         print(f"Failed to fetch content from {url}: {e}")
         return ""
 
-def search_documents_with_vector_store(documents: list, query: str) -> str:
+def search_documents_with_vector_store(documents: list, query: str, source_type: str = "Web") -> str:
     """Create vector store and search for relevant information."""
     try:
         # Split documents into chunks
@@ -273,14 +377,17 @@ def search_documents_with_vector_store(documents: list, query: str) -> str:
         # Search for relevant chunks with the original query
         relevant_docs = vectorstore.similarity_search(query, k=5)
 
-        # Format results
+        # Format results with source type indication
         results = []
+        results.append(f"🔍 Search Results from {source_type}:\n")
+
         for i, doc in enumerate(relevant_docs, 1):
             source = doc.metadata.get('source', 'Unknown source')
             title = doc.metadata.get('title', 'No title')
-            content = doc.page_content[:5000]  # First 500 chars
+            source_type_meta = doc.metadata.get('source_type', source_type)
+            content = doc.page_content[:2000]  # Increased content length
 
-            results.append(f"Result {i} from {title}:\n{content}\nSource: {source}\n")
+            results.append(f"Result {i} ({source_type_meta}) - {title}:\n{content}\nSource: {source}\n")
 
         return "\n---\n".join(results)
 
@@ -290,7 +397,7 @@ def search_documents_with_vector_store(documents: list, query: str) -> str:
 web_search_tool = Tool(
     name="enhanced_web_retrieval",
     func=enhanced_web_retrieval_tool_func,
-    description="Enhanced Wikipedia-only search with vector retrieval. Fetches full content from English Wikipedia pages and uses semantic search to find relevant information. Use this for factual questions that need detailed Wikipedia content analysis."
+    description="Enhanced cascading web search with vector retrieval. First searches Wikipedia for reliable factual information, then falls back to general web search if insufficient results are found. Supports multiple search backends (auto, html, lite, bing) and uses semantic search to find relevant information. Ideal for comprehensive research on any topic."
 )
 
 # List of all tools for easy import
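
For orientation, a minimal usage sketch that is not part of the commit. It assumes tools.py exposes the names shown in the diff (enhanced_web_retrieval_tool_func and web_search_tool), that Tool is LangChain's wrapper with its standard run() method, and that the DDGS and Document imports at the top of the file are unchanged; the query string is purely illustrative.

# Illustrative only -- exercises the cascading retrieval added in this commit.
from tools import enhanced_web_retrieval_tool_func, web_search_tool

# Direct call: tries Wikipedia first, then falls back to a general web search.
# The backend argument is forwarded to DDGS; "bing" is the default in the new signature.
print(enhanced_web_retrieval_tool_func("Ada Lovelace first algorithm", backend="bing"))

# Same behaviour through the registered LangChain tool; run() passes the single
# string input to the wrapped function, so only the query can be supplied here.
print(web_search_tool.run("Ada Lovelace first algorithm"))

Because has_sufficient_results requires at least two results with meaningful titles or snippets, sparse Wikipedia topics fall through to the general-web branch automatically.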