# NOTE: upload-page residue removed ("sizzlebop's picture / Upload 34 files / 8be2f43 verified")
"""
πŸ” Web Research Tools
Advanced web research using DuckDuckGo search and Crawl4AI content extraction
"""
import os
import requests
from typing import List, Dict, Any, Optional
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
import logging
# Try to import Crawl4AI, but have a fallback if it fails
try:
from crawl4ai import (
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig,
LLMContentFilter, DefaultMarkdownGenerator
)
CRAWL4AI_AVAILABLE = True
except ImportError:
CRAWL4AI_AVAILABLE = False
print("⚠️ Crawl4AI not available, using fallback web scraping")
logger = logging.getLogger(__name__)
class WebResearcher:
    """Advanced web research combining DuckDuckGo search with content extraction.

    Extraction prefers Crawl4AI (headless-browser crawling with an optional
    LLM-based content filter); when Crawl4AI or its Playwright browsers are
    unavailable it degrades to simple requests + BeautifulSoup scraping.
    """

    def __init__(self, max_results: int = 10, max_crawl_pages: int = 7,
                 llm_provider: Optional[str] = None):
        """Configure the researcher.

        Args:
            max_results: Maximum number of DuckDuckGo results to fetch.
            max_crawl_pages: Maximum number of result URLs to crawl/scrape.
            llm_provider: Simple provider name ("openai", "google", "gemini",
                "anthropic") used to pick Crawl4AI's LLM content filter;
                defaults to "openai" when omitted.
        """
        self.max_results = max_results
        self.max_crawl_pages = max_crawl_pages
        self.llm_provider = llm_provider or "openai"  # Default fallback
        if CRAWL4AI_AVAILABLE:
            self.browser_config = BrowserConfig(
                headless=True,
                viewport_width=1280,
                viewport_height=720
            )
        else:
            # No browser config needed for the requests-based fallback path.
            self.browser_config = None
            print("πŸ”„ Using fallback web scraping (requests + BeautifulSoup)")

    async def search_topic(self, topic: str, region: str = "us-en") -> List[Dict[str, Any]]:
        """Search DuckDuckGo for *topic*.

        Args:
            topic: Search query text.
            region: DuckDuckGo region code (e.g. "us-en").

        Returns:
            A list of dicts with keys "title", "url", "snippet", "source";
            an empty list if the search fails for any reason.
        """
        try:
            print(f"πŸ” Searching DuckDuckGo for: {topic}")
            with DDGS() as ddgs:
                results = []
                search_results = ddgs.text(
                    keywords=topic,
                    region=region,
                    safesearch="moderate",
                    max_results=self.max_results
                )
                # Normalize DDG's result schema (title/href/body) to ours.
                for result in search_results:
                    results.append({
                        "title": result.get("title", ""),
                        "url": result.get("href", ""),
                        "snippet": result.get("body", ""),
                        "source": "duckduckgo"
                    })
                print(f"βœ… Found {len(results)} search results")
                return results
        except Exception as e:
            logger.error(f"Search failed: {e}")
            print(f"❌ Search failed: {e}")
            return []

    async def _fallback_extract_content(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Fallback content extraction using requests and BeautifulSoup.

        Only the first ``self.max_crawl_pages`` URLs are scraped. Every URL
        yields a result dict (keys: url, title, content, word_count,
        extraction_success, and "error" on failure) so callers can report
        per-URL outcomes.
        """
        extracted_content = []
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        for i, url in enumerate(urls[:self.max_crawl_pages]):
            try:
                print(f"πŸ“– Scraping {i+1}/{min(len(urls), self.max_crawl_pages)}: {url}")
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                # Remove script and style elements (and obvious chrome) so
                # get_text() returns only readable page content.
                for script in soup(["script", "style", "nav", "footer", "header"]):
                    script.decompose()
                # Extract title. BUGFIX: soup.title.string is None when the
                # <title> tag has nested markup; guard before .strip().
                title = ""
                if soup.title and soup.title.string:
                    title = soup.title.string.strip()
                # Extract main content: try common main-content selectors first.
                content_selectors = [
                    'main', 'article', '.content', '#content',
                    '.post-content', '.entry-content', '.article-content'
                ]
                content = ""
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        content = content_elem.get_text(separator='\n', strip=True)
                        break
                # If no specific content area found, use the whole document.
                if not content:
                    content = soup.get_text(separator='\n', strip=True)
                # Clean up content: drop blank lines.
                lines = [line.strip() for line in content.split('\n') if line.strip()]
                content = '\n'.join(lines)
                word_count = len(content.split())
                extracted_content.append({
                    "url": url,
                    "title": title,
                    "content": content,
                    "word_count": word_count,
                    "extraction_success": True
                })
                print(f"βœ… Extracted {word_count} words from {url}")
            except Exception as e:
                logger.error(f"Error scraping {url}: {e}")
                print(f"❌ Error scraping {url}: {e}")
                extracted_content.append({
                    "url": url,
                    "title": "",
                    "content": "",
                    "word_count": 0,
                    "extraction_success": False,
                    "error": str(e)
                })
        successful_extractions = [c for c in extracted_content if c["extraction_success"]]
        print(f"βœ… Successfully extracted content from {len(successful_extractions)}/{len(urls)} URLs")
        return extracted_content

    async def extract_content(self, urls: List[str], topic: str) -> List[Dict[str, Any]]:
        """Extract content from URLs using Crawl4AI with LLM filtering.

        Falls back to :meth:`_fallback_extract_content` when Crawl4AI is not
        importable, Playwright browsers are missing, or the crawl fails with a
        Playwright-related error.

        Args:
            urls: Candidate URLs; at most ``self.max_crawl_pages`` are crawled.
            topic: Research topic, embedded in the LLM filter instruction.

        Returns:
            One result dict per attempted URL (same schema as the fallback),
            or an empty list on a non-Playwright crawl failure.
        """
        # If Crawl4AI is not available, use fallback immediately
        if not CRAWL4AI_AVAILABLE:
            print("πŸ”„ Using fallback content extraction (Crawl4AI not available)")
            return await self._fallback_extract_content(urls)
        # Check if Playwright browsers are installed
        try:
            from playwright.async_api import async_playwright
            async with async_playwright() as p:
                # Try to get browser path - this will fail if browsers aren't installed
                browser_path = p.chromium.executable_path
                if not browser_path or not os.path.exists(browser_path):
                    print("πŸ”„ Playwright browsers not installed, using fallback content extraction")
                    return await self._fallback_extract_content(urls)
        except Exception as e:
            print(f"πŸ”„ Playwright check failed ({e}), using fallback content extraction")
            return await self._fallback_extract_content(urls)
        try:
            print(f"πŸ“„ Extracting content from {len(urls)} URLs...")
            # Try to configure LLM content filter for educational content
            try:
                # Use the provider passed to the class, or fall back to environment/default
                crawl4ai_provider_simple = self.llm_provider
                # Map simple provider names to full provider/model format
                provider_mapping = {
                    "openai": "openai/gpt-4o-mini",
                    "google": "gemini/gemini-2.0-flash-exp",
                    "gemini": "gemini/gemini-2.0-flash-exp",
                    "anthropic": "gemini/gemini-2.0-flash-exp"  # Fallback since Crawl4AI doesn't support Anthropic directly
                }
                crawl4ai_provider = provider_mapping.get(crawl4ai_provider_simple, "openai/gpt-4o-mini")
                if crawl4ai_provider.startswith("gemini"):
                    # Check if Google API key is available
                    if not os.getenv("GOOGLE_API_KEY"):
                        print("⚠️ GOOGLE_API_KEY not found, falling back to OpenAI")
                        llm_config = LLMConfig(
                            provider="openai/gpt-4o-mini",
                            api_token="env:OPENAI_API_KEY"
                        )
                        print("🧠 Using OpenAI for content filtering: gpt-4o-mini (fallback)")
                    else:
                        llm_config = LLMConfig(
                            provider=crawl4ai_provider,
                            api_token="env:GOOGLE_API_KEY"
                        )
                        print(f"🧠 Using Gemini for content filtering: {crawl4ai_provider}")
                else:
                    # Default to OpenAI
                    llm_config = LLMConfig(
                        provider="openai/gpt-4o-mini",
                        api_token="env:OPENAI_API_KEY"
                    )
                    print("🧠 Using OpenAI for content filtering: gpt-4o-mini")
                content_filter = LLMContentFilter(
                    llm_config=llm_config,
                    instruction=f"""
                    Extract educational content related to "{topic}".
                    Focus on:
                    - Key concepts and explanations
                    - Practical examples and tutorials
                    - Technical details and specifications
                    - Best practices and guidelines
                    - Code examples and implementations
                    Exclude:
                    - Navigation menus and sidebars
                    - Advertisements and promotional content
                    - Footer content and legal text
                    - Unrelated content
                    Format as clean markdown with proper headers and code blocks.
                    """,
                    chunk_token_threshold=1000,
                    verbose=False
                )
                markdown_generator = DefaultMarkdownGenerator(
                    content_filter=content_filter,
                    options={"ignore_links": False}
                )
            except Exception as e:
                print(f"⚠️ Could not configure LLM content filter: {e}")
                # Fallback to basic markdown generator
                markdown_generator = DefaultMarkdownGenerator(
                    options={"ignore_links": False}
                )
            run_config = CrawlerRunConfig(
                markdown_generator=markdown_generator,
                cache_mode=CacheMode.BYPASS,
                wait_for_images=False,
                process_iframes=False,
                remove_overlay_elements=True
            )
            extracted_content = []
            async with AsyncWebCrawler(config=self.browser_config) as crawler:
                for i, url in enumerate(urls[:self.max_crawl_pages]):
                    try:
                        print(f"πŸ“– Crawling {i+1}/{min(len(urls), self.max_crawl_pages)}: {url}")
                        result = await crawler.arun(url=url, config=run_config)
                        if result.success and result.markdown:
                            extracted_content.append({
                                "url": url,
                                # BUGFIX: result.metadata may be None; guard the lookup.
                                "title": (result.metadata or {}).get("title", ""),
                                "content": result.markdown,
                                "word_count": len(result.markdown.split()),
                                "extraction_success": True
                            })
                            print(f"βœ… Extracted {len(result.markdown.split())} words from {url}")
                        else:
                            print(f"⚠️ Failed to extract content from {url}: {result.error_message}")
                            extracted_content.append({
                                "url": url,
                                "title": "",
                                "content": "",
                                "word_count": 0,
                                "extraction_success": False,
                                "error": result.error_message
                            })
                    except Exception as e:
                        logger.error(f"Error crawling {url}: {e}")
                        print(f"❌ Error crawling {url}: {e}")
                        extracted_content.append({
                            "url": url,
                            "title": "",
                            "content": "",
                            "word_count": 0,
                            "extraction_success": False,
                            "error": str(e)
                        })
            successful_extractions = [c for c in extracted_content if c["extraction_success"]]
            print(f"βœ… Successfully extracted content from {len(successful_extractions)}/{len(urls)} URLs")
            return extracted_content
        except Exception as e:
            logger.error(f"Content extraction failed: {e}")
            print(f"❌ Content extraction failed: {e}")
            # If Crawl4AI fails (likely due to Playwright), try fallback.
            # These substrings cover the known Playwright "browser missing" errors.
            error_str = str(e)
            playwright_errors = [
                "Executable doesn't exist",
                "BrowserType.launch",
                "playwright install",
                "Playwright was just installed",
                "download new browsers",
                "chromium-",
                "chrome-linux/chrome"
            ]
            if any(error in error_str for error in playwright_errors):
                print("πŸ”„ Playwright browser binaries not available, falling back to simple web scraping")
                return await self._fallback_extract_content(urls)
            return []

    async def research_topic(self, topic: str) -> Dict[str, Any]:
        """Complete research workflow: search + extract + summarize.

        Returns:
            A dict with keys "topic", "search_results", "extracted_content",
            "summary", "total_words", "successful_sources", "success". The
            failure paths now include the same keys (zeroed) as the success
            path so callers can index them unconditionally.
        """
        try:
            print(f"πŸš€ Starting comprehensive research for: {topic}")
            # Step 1: Search for relevant URLs
            search_results = await self.search_topic(topic)
            if not search_results:
                return {
                    "topic": topic,
                    "search_results": [],
                    "extracted_content": [],
                    "summary": f"No search results found for {topic}",
                    "total_words": 0,
                    "successful_sources": 0,
                    "success": False
                }
            # Step 2: Extract content from top URLs
            urls = [result["url"] for result in search_results]
            extracted_content = await self.extract_content(urls, topic)
            # Step 3: Compile research summary
            successful_content = [c for c in extracted_content if c["extraction_success"]]
            total_words = sum(c["word_count"] for c in successful_content)
            summary = f"""
            Research completed for "{topic}":
            - Found {len(search_results)} search results
            - Successfully extracted content from {len(successful_content)} sources
            - Total content: {total_words} words
            - Sources include educational articles, documentation, and tutorials
            """
            print(f"πŸŽ‰ Research completed: {len(successful_content)} sources, {total_words} words")
            return {
                "topic": topic,
                "search_results": search_results,
                "extracted_content": extracted_content,
                "summary": summary.strip(),
                "total_words": total_words,
                "successful_sources": len(successful_content),
                "success": True
            }
        except Exception as e:
            logger.error(f"Research failed: {e}")
            print(f"❌ Research failed: {e}")
            return {
                "topic": topic,
                "search_results": [],
                "extracted_content": [],
                "summary": f"Research failed for {topic}: {str(e)}",
                "total_words": 0,
                "successful_sources": 0,
                "success": False
            }
async def research_topic(topic: str, llm_provider: str = "openai") -> Dict[str, Any]:
    """Run the full research workflow for *topic* with a one-off researcher.

    Args:
        topic: Subject to search for and extract content about.
        llm_provider: Provider name forwarded to :class:`WebResearcher`.

    Returns:
        The result dict produced by ``WebResearcher.research_topic``.
    """
    researcher = WebResearcher(llm_provider=llm_provider)
    result = await researcher.research_topic(topic)
    return result