Spaces:

Agents-MCP-Hackathon
/

course-creator-ai

Running

File size: 16,237 Bytes

"""
🔍 Web Research Tools
Advanced web research using DuckDuckGo search and Crawl4AI content extraction
"""

import asyncio
import logging
import os
from typing import List, Dict, Any, Optional

import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS

# Crawl4AI is an optional dependency. CRAWL4AI_AVAILABLE records whether the
# import succeeded; WebResearcher reads it to choose between Crawl4AI and the
# requests + BeautifulSoup fallback scraper.
try:
    from crawl4ai import (
        AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig,
        LLMContentFilter, DefaultMarkdownGenerator
    )
    CRAWL4AI_AVAILABLE = True
except ImportError:
    # Only ImportError is handled here; the Crawl4AI names above stay undefined
    # in that case, so they must only be used when CRAWL4AI_AVAILABLE is True.
    CRAWL4AI_AVAILABLE = False
    print("⚠️ Crawl4AI not available, using fallback web scraping")

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class WebResearcher:
    """Research a topic on the web via DuckDuckGo search and page extraction.

    Search results come from ``duckduckgo_search.DDGS``. Page content is
    extracted with Crawl4AI (optionally passed through an LLM content filter)
    when the library and a Playwright Chromium binary are available; otherwise
    a ``requests`` + ``BeautifulSoup`` scraper is used.

    Every extraction record is a dict with the keys ``url``, ``title``,
    ``content``, ``word_count`` and ``extraction_success``; failed records
    additionally carry ``error``.
    """

    # Browser-like User-Agent sent by the requests-based fallback scraper.
    _FALLBACK_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Per-request timeout, in seconds, for the fallback scraper.
    _FALLBACK_TIMEOUT = 10

    # CSS selectors tried in order to locate a page's main content area.
    _CONTENT_SELECTORS = (
        'main', 'article', '.content', '#content',
        '.post-content', '.entry-content', '.article-content',
    )

    # Provider/model string used when the configured provider is unknown.
    _DEFAULT_PROVIDER = "openai/gpt-4o-mini"

    # Maps simple provider names to the provider/model format given to LLMConfig.
    _PROVIDER_MAPPING = {
        "openai": "openai/gpt-4o-mini",
        "google": "gemini/gemini-2.0-flash-exp",
        "gemini": "gemini/gemini-2.0-flash-exp",
        "anthropic": "gemini/gemini-2.0-flash-exp",  # Fallback since Crawl4AI doesn't support Anthropic directly
    }

    # Substrings of an error message that indicate missing Playwright browsers.
    _PLAYWRIGHT_ERROR_MARKERS = (
        "Executable doesn't exist",
        "BrowserType.launch",
        "playwright install",
        "Playwright was just installed",
        "download new browsers",
        "chromium-",
        "chrome-linux/chrome",
    )

    def __init__(self, max_results: int = 10, max_crawl_pages: int = 7, llm_provider: Optional[str] = None):
        """Configure the researcher.

        Args:
            max_results: Maximum number of search results requested from DuckDuckGo.
            max_crawl_pages: Maximum number of URLs whose content is extracted.
            llm_provider: Simple provider name ("openai", "google", "gemini",
                "anthropic") for the Crawl4AI content filter. Falsy values
                default to "openai".
        """
        self.max_results = max_results
        self.max_crawl_pages = max_crawl_pages
        self.llm_provider = llm_provider or "openai"  # Default fallback

        if CRAWL4AI_AVAILABLE:
            self.browser_config = BrowserConfig(
                headless=True,
                viewport_width=1280,
                viewport_height=720
            )
        else:
            self.browser_config = None
            print("🔄 Using fallback web scraping (requests + BeautifulSoup)")

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _success_record(url: str, title: str, content: str) -> Dict[str, Any]:
        """Build the record for a page whose content was extracted."""
        return {
            "url": url,
            "title": title,
            "content": content,
            "word_count": len(content.split()),
            "extraction_success": True
        }

    @staticmethod
    def _failure_record(url: str, error: Any) -> Dict[str, Any]:
        """Build the record for a page whose extraction failed."""
        return {
            "url": url,
            "title": "",
            "content": "",
            "word_count": 0,
            "extraction_success": False,
            "error": error
        }

    @staticmethod
    def _failed_research(topic: str, summary: str) -> Dict[str, Any]:
        """Build the result returned by ``research_topic`` on failure.

        Carries the same keys as a successful result so callers can read
        ``total_words`` / ``successful_sources`` without checking ``success``.
        """
        return {
            "topic": topic,
            "search_results": [],
            "extracted_content": [],
            "summary": summary,
            "total_words": 0,
            "successful_sources": 0,
            "success": False
        }

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Strip every line of ``text`` and drop the lines that end up empty."""
        stripped = (line.strip() for line in text.split('\n'))
        return '\n'.join(line for line in stripped if line)

    @staticmethod
    def _select_markdown(markdown: Any) -> str:
        """Return the markdown text to store for a crawl result.

        Prefers a non-empty ``fit_markdown`` attribute, which per the Crawl4AI
        documentation holds the content-filter output, and otherwise falls
        back to the plain string value. Returns "" for a falsy input.
        """
        if not markdown:
            return ""
        fit = getattr(markdown, "fit_markdown", None)
        if isinstance(fit, str) and fit.strip():
            return fit
        return str(markdown)

    @staticmethod
    def _report_extraction(records: List[Dict[str, Any]], attempted: int) -> None:
        """Print how many of the ``attempted`` URLs were extracted successfully."""
        succeeded = sum(1 for record in records if record["extraction_success"])
        print(f"✅ Successfully extracted content from {succeeded}/{attempted} URLs")

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def _search_sync(self, topic: str, region: str) -> List[Dict[str, Any]]:
        """Run the (blocking) DuckDuckGo text search and normalize the hits."""
        with DDGS() as ddgs:
            hits = ddgs.text(
                keywords=topic,
                region=region,
                safesearch="moderate",
                max_results=self.max_results
            )
            # Consume the hits while the DDGS session is still open.
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "snippet": hit.get("body", ""),
                    "source": "duckduckgo"
                }
                for hit in hits
            ]

    async def search_topic(self, topic: str, region: str = "us-en") -> List[Dict[str, Any]]:
        """Search for a topic using DuckDuckGo.

        Args:
            topic: Search keywords.
            region: DuckDuckGo region code.

        Returns:
            A list of dicts with ``title``, ``url``, ``snippet`` and ``source``
            keys; an empty list if the search raised.
        """
        try:
            print(f"🔍 Searching DuckDuckGo for: {topic}")

            # The DDGS client is synchronous; run it in a worker thread so the
            # event loop is not blocked for the duration of the search.
            loop = asyncio.get_running_loop()
            results = await loop.run_in_executor(None, self._search_sync, topic, region)

            print(f"✅ Found {len(results)} search results")
            return results

        except Exception as e:
            logger.error("Search failed: %s", e)
            print(f"❌ Search failed: {e}")
            return []

    # ------------------------------------------------------------------
    # Fallback extraction (requests + BeautifulSoup)
    # ------------------------------------------------------------------

    def _scrape_page(self, url: str) -> Dict[str, Any]:
        """Fetch ``url`` synchronously and return its success record.

        Raises whatever ``requests`` raises (including HTTP error statuses via
        ``raise_for_status``); the caller turns that into a failure record.
        """
        response = requests.get(
            url, headers=self._FALLBACK_HEADERS, timeout=self._FALLBACK_TIMEOUT
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script/style and page-chrome elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # ``soup.title.string`` is None for an empty <title> or one with nested
        # tags, so read the title through get_text() instead.
        title = soup.title.get_text(strip=True) if soup.title else ""

        # Use the first matching main-content container, if any
        content = ""
        for selector in self._CONTENT_SELECTORS:
            container = soup.select_one(selector)
            if container:
                content = container.get_text(separator='\n', strip=True)
                break

        # If no specific content area was found (or it was empty), use the whole page
        if not content:
            content = soup.get_text(separator='\n', strip=True)

        return self._success_record(url, title, self._normalize_text(content))

    async def _fallback_extract_content(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Fallback content extraction using requests and BeautifulSoup.

        Processes at most ``max_crawl_pages`` URLs, one at a time, and returns
        one record per attempted URL (success or failure).
        """
        targets = urls[:self.max_crawl_pages]
        loop = asyncio.get_running_loop()
        extracted_content = []

        for position, url in enumerate(targets, start=1):
            try:
                print(f"📖 Scraping {position}/{len(targets)}: {url}")

                # requests is blocking; keep the event loop responsive by
                # fetching and parsing in a worker thread.
                record = await loop.run_in_executor(None, self._scrape_page, url)
                extracted_content.append(record)

                print(f"✅ Extracted {record['word_count']} words from {url}")

            except Exception as e:
                logger.error("Error scraping %s: %s", url, e)
                print(f"❌ Error scraping {url}: {e}")
                extracted_content.append(self._failure_record(url, str(e)))

        self._report_extraction(extracted_content, len(targets))

        return extracted_content

    # ------------------------------------------------------------------
    # Crawl4AI extraction
    # ------------------------------------------------------------------

    def _build_llm_config(self):
        """Create the LLMConfig for the content filter from ``self.llm_provider``.

        Gemini is used only when ``GOOGLE_API_KEY`` is set; every other case
        uses OpenAI ``gpt-4o-mini``.
        """
        provider = self._PROVIDER_MAPPING.get(self.llm_provider, self._DEFAULT_PROVIDER)

        if provider.startswith("gemini"):
            if os.getenv("GOOGLE_API_KEY"):
                llm_config = LLMConfig(
                    provider=provider,
                    api_token="env:GOOGLE_API_KEY"
                )
                print(f"🧠 Using Gemini for content filtering: {provider}")
                return llm_config

            print("⚠️ GOOGLE_API_KEY not found, falling back to OpenAI")
            llm_config = LLMConfig(
                provider="openai/gpt-4o-mini",
                api_token="env:OPENAI_API_KEY"
            )
            print("🧠 Using OpenAI for content filtering: gpt-4o-mini (fallback)")
            return llm_config

        # Default to OpenAI
        llm_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="env:OPENAI_API_KEY"
        )
        print("🧠 Using OpenAI for content filtering: gpt-4o-mini")
        return llm_config

    def _build_markdown_generator(self, topic: str):
        """Create the markdown generator, with an LLM content filter if possible.

        If configuring the filter raises, a generator without a content filter
        is returned instead.
        """
        try:
            content_filter = LLMContentFilter(
                llm_config=self._build_llm_config(),
                instruction=f"""
                    Extract educational content related to "{topic}".
                    Focus on:
                    - Key concepts and explanations
                    - Practical examples and tutorials
                    - Technical details and specifications
                    - Best practices and guidelines
                    - Code examples and implementations

                    Exclude:
                    - Navigation menus and sidebars
                    - Advertisements and promotional content
                    - Footer content and legal text
                    - Unrelated content

                    Format as clean markdown with proper headers and code blocks.
                    """,
                chunk_token_threshold=1000,
                verbose=False
            )

            return DefaultMarkdownGenerator(
                content_filter=content_filter,
                options={"ignore_links": False}
            )
        except Exception as e:
            print(f"⚠️ Could not configure LLM content filter: {e}")
            # Fallback to basic markdown generator
            return DefaultMarkdownGenerator(
                options={"ignore_links": False}
            )

    async def extract_content(self, urls: List[str], topic: str) -> List[Dict[str, Any]]:
        """Extract content from URLs using Crawl4AI with LLM filtering.

        Falls back to the requests/BeautifulSoup scraper when Crawl4AI is not
        importable, when no Playwright Chromium binary is found, or when the
        crawl fails with a Playwright-related error.

        Args:
            urls: Candidate URLs; at most ``max_crawl_pages`` are processed.
            topic: Topic inserted into the content-filter instruction.

        Returns:
            One record per attempted URL, or an empty list if the crawl failed
            with an error that is not recognized as Playwright-related.
        """
        # If Crawl4AI is not available, use fallback immediately
        if not CRAWL4AI_AVAILABLE:
            print("🔄 Using fallback content extraction (Crawl4AI not available)")
            return await self._fallback_extract_content(urls)

        # Check if Playwright browsers are installed. The Playwright context is
        # closed before any fallback scraping starts.
        try:
            from playwright.async_api import async_playwright
            async with async_playwright() as p:
                browser_path = p.chromium.executable_path
            browsers_installed = bool(browser_path) and os.path.exists(browser_path)
        except Exception as e:
            print(f"🔄 Playwright check failed ({e}), using fallback content extraction")
            return await self._fallback_extract_content(urls)

        if not browsers_installed:
            print("🔄 Playwright browsers not installed, using fallback content extraction")
            return await self._fallback_extract_content(urls)

        try:
            print(f"📄 Extracting content from {len(urls)} URLs...")

            run_config = CrawlerRunConfig(
                markdown_generator=self._build_markdown_generator(topic),
                cache_mode=CacheMode.BYPASS,
                wait_for_images=False,
                process_iframes=False,
                remove_overlay_elements=True
            )

            targets = urls[:self.max_crawl_pages]
            extracted_content = []

            async with AsyncWebCrawler(config=self.browser_config) as crawler:
                for position, url in enumerate(targets, start=1):
                    try:
                        print(f"📖 Crawling {position}/{len(targets)}: {url}")

                        result = await crawler.arun(url=url, config=run_config)

                        content = self._select_markdown(result.markdown) if result.success else ""
                        if content:
                            # metadata may be missing; don't lose the page over a title
                            title = (result.metadata or {}).get("title") or ""
                            record = self._success_record(url, title, content)
                            extracted_content.append(record)
                            print(f"✅ Extracted {record['word_count']} words from {url}")
                        else:
                            print(f"⚠️ Failed to extract content from {url}: {result.error_message}")
                            extracted_content.append(
                                self._failure_record(url, result.error_message)
                            )

                    except Exception as e:
                        logger.error("Error crawling %s: %s", url, e)
                        print(f"❌ Error crawling {url}: {e}")
                        extracted_content.append(self._failure_record(url, str(e)))

            self._report_extraction(extracted_content, len(targets))

            return extracted_content

        except Exception as e:
            logger.error("Content extraction failed: %s", e)
            print(f"❌ Content extraction failed: {e}")

            # If Crawl4AI fails (likely due to Playwright), try fallback
            error_str = str(e)
            if any(marker in error_str for marker in self._PLAYWRIGHT_ERROR_MARKERS):
                print("🔄 Playwright browser binaries not available, falling back to simple web scraping")
                return await self._fallback_extract_content(urls)

            return []

    # ------------------------------------------------------------------
    # Workflow
    # ------------------------------------------------------------------

    async def research_topic(self, topic: str) -> Dict[str, Any]:
        """Complete research workflow: search + extract + summarize.

        Returns:
            A dict with ``topic``, ``search_results``, ``extracted_content``,
            ``summary``, ``total_words``, ``successful_sources`` and
            ``success``. ``success`` is False when the search returned nothing
            or an exception was raised.
        """
        try:
            print(f"🚀 Starting comprehensive research for: {topic}")

            # Step 1: Search for relevant URLs
            search_results = await self.search_topic(topic)

            if not search_results:
                return self._failed_research(topic, f"No search results found for {topic}")

            # Step 2: Extract content from top URLs
            urls = [result["url"] for result in search_results]
            extracted_content = await self.extract_content(urls, topic)

            # Step 3: Compile research summary
            successful_content = [c for c in extracted_content if c["extraction_success"]]
            total_words = sum(c["word_count"] for c in successful_content)

            summary = f"""
            Research completed for "{topic}":
            - Found {len(search_results)} search results
            - Successfully extracted content from {len(successful_content)} sources
            - Total content: {total_words} words
            - Sources include educational articles, documentation, and tutorials
            """

            print(f"🎉 Research completed: {len(successful_content)} sources, {total_words} words")

            return {
                "topic": topic,
                "search_results": search_results,
                "extracted_content": extracted_content,
                "summary": summary.strip(),
                "total_words": total_words,
                "successful_sources": len(successful_content),
                "success": True
            }

        except Exception as e:
            logger.error("Research failed: %s", e)
            print(f"❌ Research failed: {e}")
            return self._failed_research(topic, f"Research failed for {topic}: {str(e)}")


async def research_topic(topic: str, llm_provider: str = "openai") -> Dict[str, Any]:
    """Run the full research workflow for ``topic`` with a one-off researcher.

    Creates a ``WebResearcher`` configured with ``llm_provider`` (all other
    settings at their defaults) and returns the result of its
    ``research_topic`` method.
    """
    return await WebResearcher(llm_provider=llm_provider).research_topic(topic)