azettl committed on
Commit 86dda1b · verified · 1 parent: acfb8b4

Delete research_tools/research_agent.py

Files changed (1)
  1. research_tools/research_agent.py +0 -487
research_tools/research_agent.py DELETED
@@ -1,487 +0,0 @@
-"""
-Enhanced Research Agent with Multi-Source Integration
-"""
-from typing import Dict, List, Any, Optional, Tuple
-import re
-from collections import Counter
-
-from .base_tool import BaseTool
-from .web_search import WebSearchTool
-from .wikipedia_search import WikipediaSearchTool
-from .arxiv_search import ArxivSearchTool
-from .github_search import GitHubSearchTool
-from .sec_search import SECSearchTool
-
-
-class EnhancedResearchAgent:
-    """Enhanced research agent with multi-source synthesis and smart routing"""
-
-    def __init__(self):
-        # Initialize all research tools
-        self.tools = {
-            'web': WebSearchTool(),
-            'wikipedia': WikipediaSearchTool(),
-            'arxiv': ArxivSearchTool(),
-            'github': GitHubSearchTool(),
-            'sec': SECSearchTool()
-        }
-
-        # Tool availability status
-        self.tool_status = {name: True for name in self.tools.keys()}
-
-    def search(self, query: str, research_depth: str = "standard") -> str:
-        """Main search method with intelligent routing"""
-        if research_depth == "deep":
-            return self._deep_multi_source_search(query)
-        else:
-            return self._standard_search(query)
-
-    def search_wikipedia(self, topic: str) -> str:
-        """Wikipedia search method for backward compatibility"""
-        return self.tools['wikipedia'].search(topic)
-
-    def _standard_search(self, query: str) -> str:
-        """Standard single-source search with smart routing"""
-        # Determine best tool for the query
-        best_tool = self._route_query_to_tool(query)
-
-        try:
-            return self.tools[best_tool].search(query)
-        except Exception as e:
-            # Fallback to web search
-            if best_tool != 'web':
-                try:
-                    return self.tools['web'].search(query)
-                except Exception as e2:
-                    return f"**Research for: {query}**\n\nResearch temporarily unavailable: {str(e2)[:100]}..."
-            else:
-                return f"**Research for: {query}**\n\nResearch temporarily unavailable: {str(e)[:100]}..."
-
-    def _deep_multi_source_search(self, query: str) -> str:
-        """Deep research using multiple sources with synthesis"""
-        results = {}
-        quality_scores = {}
-
-        # Determine which sources to use based on query type
-        relevant_tools = self._get_relevant_tools(query)
-
-        # Collect results from multiple sources
-        for tool_name in relevant_tools:
-            try:
-                result = self.tools[tool_name].search(query)
-                if result and len(result.strip()) > 50:  # Ensure meaningful result
-                    results[tool_name] = result
-                    quality_scores[tool_name] = self.tools[tool_name].score_research_quality(result, tool_name)
-            except Exception as e:
-                print(f"Error with {tool_name}: {e}")
-                continue
-
-        if not results:
-            return f"**Deep Research for: {query}**\n\nNo sources were able to provide results. Please try a different query."
-
-        # Synthesize results
-        return self._synthesize_multi_source_results(query, results, quality_scores)
-
-    def _route_query_to_tool(self, query: str) -> str:
-        """Intelligently route query to the most appropriate tool"""
-        query_lower = query.lower()
-
-        # Priority routing based on query characteristics
-        for tool_name, tool in self.tools.items():
-            if tool.should_use_for_query(query):
-                # Return first matching tool based on priority order
-                priority_order = ['arxiv', 'sec', 'github', 'wikipedia', 'web']
-                if tool_name in priority_order[:3]:  # High-priority specialized tools
-                    return tool_name
-
-        # Secondary check for explicit indicators
-        if any(indicator in query_lower for indicator in ['company', 'stock', 'financial', 'revenue']):
-            return 'sec'
-        elif any(indicator in query_lower for indicator in ['research', 'study', 'academic', 'paper']):
-            return 'arxiv'
-        elif any(indicator in query_lower for indicator in ['technology', 'framework', 'programming']):
-            return 'github'
-        elif any(indicator in query_lower for indicator in ['what is', 'definition', 'history']):
-            return 'wikipedia'
-        else:
-            return 'web'  # Default fallback
-
-    def _get_relevant_tools(self, query: str) -> List[str]:
-        """Get list of relevant tools for deep search"""
-        relevant_tools = []
-
-        # Always include web search for current information
-        relevant_tools.append('web')
-
-        # Add specialized tools based on query
-        for tool_name, tool in self.tools.items():
-            if tool_name != 'web' and tool.should_use_for_query(query):
-                relevant_tools.append(tool_name)
-
-        # Ensure we don't overwhelm with too many sources
-        if len(relevant_tools) > 4:
-            # Prioritize specialized tools
-            priority_order = ['arxiv', 'sec', 'github', 'wikipedia', 'web']
-            relevant_tools = [tool for tool in priority_order if tool in relevant_tools][:4]
-
-        return relevant_tools
-
-    def _synthesize_multi_source_results(self, query: str, results: Dict[str, str], quality_scores: Dict[str, Dict]) -> str:
-        """Synthesize results from multiple research sources"""
-        synthesis = f"**Comprehensive Research Analysis: {query}**\n\n"
-
-        # Add source summary
-        synthesis += f"**Research Sources Used:** {', '.join(results.keys()).replace('_', ' ').title()}\n\n"
-
-        # Find key themes and agreements/disagreements
-        key_findings = self._extract_key_findings(results)
-        synthesis += self._format_key_findings(key_findings)
-
-        # Add individual source results (condensed)
-        synthesis += "**Detailed Source Results:**\n\n"
-
-        # Sort sources by quality score
-        sorted_sources = sorted(quality_scores.items(), key=lambda x: x[1]['overall'], reverse=True)
-
-        for source_name, _ in sorted_sources:
-            if source_name in results:
-                source_result = results[source_name]
-                quality = quality_scores[source_name]
-
-                # Condense long results
-                if len(source_result) > 800:
-                    source_result = source_result[:800] + "...\n[Result truncated for synthesis]"
-
-                synthesis += f"**{source_name.replace('_', ' ').title()} (Quality: {quality['overall']:.2f}/1.0):**\n"
-                synthesis += f"{source_result}\n\n"
-
-        # Add research quality assessment
-        synthesis += self._format_research_quality_assessment(quality_scores)
-
-        return synthesis
-
-    def _extract_key_findings(self, results: Dict[str, str]) -> Dict[str, List[str]]:
-        """Extract key findings and themes from multiple sources"""
-        findings = {
-            'agreements': [],
-            'contradictions': [],
-            'unique_insights': [],
-            'data_points': []
-        }
-
-        # Extract key sentences from each source
-        all_sentences = []
-        source_sentences = {}
-
-        for source, result in results.items():
-            sentences = self._extract_key_sentences(result)
-            source_sentences[source] = sentences
-            all_sentences.extend(sentences)
-
-        # Find common themes (simplified approach)
-        word_counts = Counter()
-        for sentence in all_sentences:
-            words = re.findall(r'\b\w{4,}\b', sentence.lower())  # Words 4+ chars
-            word_counts.update(words)
-
-        common_themes = [word for word, count in word_counts.most_common(10) if count > 1]
-
-        # Look for numerical data
-        numbers = re.findall(r'\b\d+(?:\.\d+)?%?\b', ' '.join(all_sentences))
-        findings['data_points'] = list(set(numbers))[:10]  # Top 10 unique numbers
-
-        # Simplified agreement detection
-        if len(source_sentences) > 1:
-            findings['agreements'] = [f"Multiple sources mention: {theme}" for theme in common_themes[:3]]
-
-        return findings
-
-    def _extract_key_sentences(self, text: str) -> List[str]:
-        """Extract key sentences from research text"""
-        if not text:
-            return []
-
-        # Split into sentences
-        sentences = re.split(r'[.!?]+', text)
-
-        # Filter for key sentences (containing important indicators)
-        key_indicators = [
-            'research shows', 'study found', 'according to', 'data indicates',
-            'results suggest', 'analysis reveals', 'evidence shows', 'reported that',
-            'concluded that', 'demonstrated that', 'increased', 'decreased',
-            'growth', 'decline', 'significant', 'important', 'critical'
-        ]
-
-        key_sentences = []
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if (len(sentence) > 30 and
-                    any(indicator in sentence.lower() for indicator in key_indicators)):
-                key_sentences.append(sentence)
-
-        return key_sentences[:5]  # Top 5 key sentences
-
-    def _format_key_findings(self, findings: Dict[str, List[str]]) -> str:
-        """Format key findings summary"""
-        result = "**Key Research Synthesis:**\n\n"
-
-        if findings['agreements']:
-            result += "**Common Themes:**\n"
-            for agreement in findings['agreements']:
-                result += f"• {agreement}\n"
-            result += "\n"
-
-        if findings['data_points']:
-            result += "**Key Data Points:**\n"
-            for data in findings['data_points'][:5]:
-                result += f"• {data}\n"
-            result += "\n"
-
-        if findings['unique_insights']:
-            result += "**Unique Insights:**\n"
-            for insight in findings['unique_insights']:
-                result += f"• {insight}\n"
-            result += "\n"
-
-        return result
-
-    def _format_research_quality_assessment(self, quality_scores: Dict[str, Dict]) -> str:
-        """Format overall research quality assessment"""
-        if not quality_scores:
-            return ""
-
-        result = "**Research Quality Assessment:**\n\n"
-
-        # Calculate average quality metrics
-        avg_overall = sum(scores['overall'] for scores in quality_scores.values()) / len(quality_scores)
-        avg_authority = sum(scores['authority'] for scores in quality_scores.values()) / len(quality_scores)
-        avg_recency = sum(scores['recency'] for scores in quality_scores.values()) / len(quality_scores)
-        avg_specificity = sum(scores['specificity'] for scores in quality_scores.values()) / len(quality_scores)
-
-        result += f"• Overall Research Quality: {avg_overall:.2f}/1.0\n"
-        result += f"• Source Authority: {avg_authority:.2f}/1.0\n"
-        result += f"• Information Recency: {avg_recency:.2f}/1.0\n"
-        result += f"• Data Specificity: {avg_specificity:.2f}/1.0\n"
-        result += f"• Sources Consulted: {len(quality_scores)}\n\n"
-
-        # Quality interpretation
-        if avg_overall >= 0.8:
-            quality_level = "Excellent"
-        elif avg_overall >= 0.6:
-            quality_level = "Good"
-        elif avg_overall >= 0.4:
-            quality_level = "Moderate"
-        else:
-            quality_level = "Limited"
-
-        result += f"**Research Reliability: {quality_level}**\n"
-
-        if avg_authority >= 0.8:
-            result += "• High-authority sources with strong credibility\n"
-        if avg_recency >= 0.7:
-            result += "• Current and up-to-date information\n"
-        if avg_specificity >= 0.6:
-            result += "• Specific data points and quantitative evidence\n"
-
-        return result
-
-    def generate_research_queries(self, question: str, current_discussion: List[Dict]) -> List[str]:
-        """Auto-generate targeted research queries based on discussion gaps"""
-
-        # Analyze discussion for gaps
-        discussion_text = "\n".join([msg.get('text', '') for msg in current_discussion])
-
-        # Extract claims that need verification
-        unsubstantiated_claims = self._find_unsubstantiated_claims(discussion_text)
-
-        # Generate specific queries
-        queries = []
-
-        # Add queries for unsubstantiated claims
-        for claim in unsubstantiated_claims[:3]:
-            query = self._convert_claim_to_query(claim)
-            if query:
-                queries.append(query)
-
-        # Add queries for missing quantitative data
-        if not re.search(r'\d+%', discussion_text):
-            queries.append(f"{question} statistics data percentages")
-
-        # Add current trends query
-        queries.append(f"{question} 2024 2025 recent developments")
-
-        return queries[:3]  # Limit to 3 targeted queries
-
-    def _find_unsubstantiated_claims(self, discussion_text: str) -> List[str]:
-        """Find claims that might need research backing"""
-        claims = []
-
-        # Look for assertion patterns
-        assertion_patterns = [
-            r'(?:should|must|will|is|are)\s+[^.]{20,100}',
-            r'(?:studies show|research indicates|data suggests)\s+[^.]{20,100}',
-            r'(?:according to|based on)\s+[^.]{20,100}'
-        ]
-
-        for pattern in assertion_patterns:
-            matches = re.findall(pattern, discussion_text, re.IGNORECASE)
-            claims.extend(matches[:2])  # Limit matches per pattern
-
-        return claims
-
-    def _convert_claim_to_query(self, claim: str) -> Optional[str]:
-        """Convert a claim into a research query"""
-        if not claim or len(claim) < 10:
-            return None
-
-        # Extract key terms
-        key_terms = re.findall(r'\b\w{4,}\b', claim.lower())
-        if len(key_terms) < 2:
-            return None
-
-        # Create query from key terms
-        query_terms = key_terms[:4]  # Use first 4 meaningful terms
-        return " ".join(query_terms)
-
-    def prioritize_research_needs(self, expert_positions: List[Dict], question: str) -> List[str]:
-        """Identify and prioritize research that could resolve expert conflicts"""
-
-        # Extract expert claims
-        expert_claims = {}
-        for position in expert_positions:
-            speaker = position.get('speaker', 'Unknown')
-            text = position.get('text', '')
-            expert_claims[speaker] = self._extract_key_claims(text)
-
-        # Find disagreements
-        disagreements = self._find_expert_disagreements(expert_claims)
-
-        # Generate research priorities
-        priorities = []
-
-        for disagreement in disagreements[:3]:
-            # Create research query to resolve disagreement
-            query = f"{question} {disagreement['topic']} evidence data"
-            priorities.append(query)
-
-        return priorities
-
-    def _extract_key_claims(self, expert_text: str) -> List[str]:
-        """Extract key factual claims from expert response"""
-        if not expert_text:
-            return []
-
-        sentences = expert_text.split('.')
-        claims = []
-
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if (len(sentence) > 20 and
-                    any(indicator in sentence.lower() for indicator in [
-                        'should', 'will', 'is', 'are', 'must', 'can', 'would', 'could'
-                    ])):
-                claims.append(sentence)
-
-        return claims[:3]  # Top 3 claims
-
-    def _find_expert_disagreements(self, expert_claims: Dict[str, List[str]]) -> List[Dict]:
-        """Identify areas where experts disagree"""
-        disagreements = []
-
-        experts = list(expert_claims.keys())
-
-        for i, expert1 in enumerate(experts):
-            for expert2 in experts[i+1:]:
-                claims1 = expert_claims[expert1]
-                claims2 = expert_claims[expert2]
-
-                conflicts = self._find_conflicting_claims(claims1, claims2)
-                if conflicts:
-                    disagreements.append({
-                        'experts': [expert1, expert2],
-                        'topic': self._extract_conflict_topic(conflicts[0]),
-                        'conflicts': conflicts[:1]  # Just the main conflict
-                    })
-
-        return disagreements
-
-    def _find_conflicting_claims(self, claims1: List[str], claims2: List[str]) -> List[str]:
-        """Identify potentially conflicting claims (simplified)"""
-        conflicts = []
-
-        # Simple opposing sentiment detection
-        opposing_pairs = [
-            ('should', 'should not'), ('will', 'will not'), ('is', 'is not'),
-            ('increase', 'decrease'), ('better', 'worse'), ('yes', 'no'),
-            ('support', 'oppose'), ('benefit', 'harm'), ('effective', 'ineffective')
-        ]
-
-        for claim1 in claims1:
-            for claim2 in claims2:
-                for pos, neg in opposing_pairs:
-                    if pos in claim1.lower() and neg in claim2.lower():
-                        conflicts.append(f"{claim1} vs {claim2}")
-                    elif neg in claim1.lower() and pos in claim2.lower():
-                        conflicts.append(f"{claim1} vs {claim2}")
-
-        return conflicts
-
-    def _extract_conflict_topic(self, conflict: str) -> str:
-        """Extract the main topic from a conflict description"""
-        # Simple extraction of key terms
-        words = re.findall(r'\b\w{4,}\b', conflict.lower())
-        # Filter out common words
-        stopwords = {'should', 'will', 'would', 'could', 'this', 'that', 'with', 'from', 'they', 'them'}
-        topic_words = [word for word in words if word not in stopwords]
-        return " ".join(topic_words[:3])
-
-    def suggest_research_follow_ups(self, discussion_log: List[Dict], question: str) -> List[str]:
-        """Suggest additional research questions based on discussion patterns"""
-
-        # Get recent discussion
-        latest_messages = discussion_log[-6:] if len(discussion_log) > 6 else discussion_log
-        recent_text = "\n".join([msg.get('content', '') for msg in latest_messages])
-
-        follow_ups = []
-
-        # Look for unverified statistics
-        if re.search(r'\d+%', recent_text):
-            follow_ups.append(f"{question} statistics verification current data")
-
-        # Look for trend mentions
-        trend_keywords = ['trend', 'growing', 'increasing', 'declining', 'emerging']
-        if any(keyword in recent_text.lower() for keyword in trend_keywords):
-            follow_ups.append(f"{question} current trends 2024 2025")
-
-        # Look for example mentions
-        if 'example' in recent_text.lower() or 'case study' in recent_text.lower():
-            follow_ups.append(f"{question} case studies examples evidence")
-
-        return follow_ups[:3]
-
-    def get_tool_status(self) -> Dict[str, bool]:
-        """Get status of all research tools"""
-        return {
-            name: self.tool_status.get(name, True)
-            for name in self.tools.keys()
-        }
-
-    def test_tool_connections(self) -> Dict[str, str]:
-        """Test all research tool connections"""
-        results = {}
-
-        for name, tool in self.tools.items():
-            try:
-                # Simple test query
-                test_result = tool.search("test", max_results=1)
-                if test_result and len(test_result) > 20:
-                    results[name] = "✅ Working"
-                    self.tool_status[name] = True
-                else:
-                    results[name] = "⚠️ Limited response"
-                    self.tool_status[name] = False
-            except Exception as e:
-                results[name] = f"❌ Error: {str(e)[:50]}..."
-                self.tool_status[name] = False
-
        return results
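
Since this commit removes the file outright, the following is a minimal usage sketch of the agent defined above, for reference only. It assumes the research_tools package (with the sibling tool modules imported at the top of the deleted file) is still importable from the parent commit acfb8b4 or an earlier revision; the queries are hypothetical examples.

# Hypothetical example; EnhancedResearchAgent exists only in revisions before this commit.
from research_tools.research_agent import EnhancedResearchAgent

agent = EnhancedResearchAgent()

# Standard search: routed to the single most relevant tool (web, wikipedia, arxiv, github, or sec).
print(agent.search("What is retrieval-augmented generation?"))

# Deep search: queries several tools and returns a synthesized, quality-scored report.
print(agent.search("large language model agent benchmarks", research_depth="deep"))

# Check which underlying tools are currently responding before relying on deep research.
print(agent.test_tool_connections())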