File size: 16,237 Bytes
94ecb74
 
 
 
 
 
8be2f43
94ecb74
 
8be2f43
94ecb74
 
8be2f43
 
 
 
 
 
 
 
 
 
 
94ecb74
 
 
 
 
 
 
 
 
 
8be2f43
 
 
 
 
 
 
 
 
 
94ecb74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8be2f43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94ecb74
 
8be2f43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94ecb74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8be2f43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94ecb74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
"""
πŸ” Web Research Tools
Advanced web research using DuckDuckGo search and Crawl4AI content extraction
"""

import os
import requests
from typing import List, Dict, Any, Optional
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
import logging

# Try to import Crawl4AI, but have a fallback if it fails
try:
    from crawl4ai import (
        AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig,
        LLMContentFilter, DefaultMarkdownGenerator
    )
    CRAWL4AI_AVAILABLE = True
except ImportError:
    CRAWL4AI_AVAILABLE = False
    print("⚠️ Crawl4AI not available, using fallback web scraping")

logger = logging.getLogger(__name__)


class WebResearcher:
    """Advanced web research using DuckDuckGo and Crawl4AI.

    Workflow: DuckDuckGo text search -> crawl the top result URLs with
    Crawl4AI (LLM-filtered markdown extraction) -> compile a summary.
    When Crawl4AI or its Playwright browser binaries are unavailable, a
    simpler requests + BeautifulSoup scraper is used instead.
    """

    def __init__(self, max_results: int = 10, max_crawl_pages: int = 7,
                 llm_provider: Optional[str] = None):
        """
        Args:
            max_results: Maximum number of DuckDuckGo results to collect.
            max_crawl_pages: Maximum number of result URLs to crawl/scrape.
            llm_provider: Simple provider name ("openai", "google", "gemini",
                or "anthropic") used to pick Crawl4AI's content-filter LLM.
                Defaults to "openai" when omitted.
        """
        self.max_results = max_results
        self.max_crawl_pages = max_crawl_pages
        self.llm_provider = llm_provider or "openai"  # Default fallback

        if CRAWL4AI_AVAILABLE:
            self.browser_config = BrowserConfig(
                headless=True,
                viewport_width=1280,
                viewport_height=720
            )
        else:
            # No Crawl4AI: browser_config stays None and the fallback
            # scraper (_fallback_extract_content) is used for extraction.
            self.browser_config = None
            print("πŸ”„ Using fallback web scraping (requests + BeautifulSoup)")

    async def search_topic(self, topic: str, region: str = "us-en") -> List[Dict[str, Any]]:
        """Search for a topic using DuckDuckGo.

        Returns a list of dicts with "title", "url", "snippet", "source"
        keys; an empty list on any failure (errors are logged, not raised).

        NOTE(review): DDGS performs blocking network I/O even though this
        method is async — it will stall the event loop while searching.
        """
        try:
            print(f"πŸ” Searching DuckDuckGo for: {topic}")

            with DDGS() as ddgs:
                results = []
                search_results = ddgs.text(
                    keywords=topic,
                    region=region,
                    safesearch="moderate",
                    max_results=self.max_results
                )

                for result in search_results:
                    # Normalize DDG's field names (href/body) to our schema.
                    results.append({
                        "title": result.get("title", ""),
                        "url": result.get("href", ""),
                        "snippet": result.get("body", ""),
                        "source": "duckduckgo"
                    })

                print(f"βœ… Found {len(results)} search results")
                return results

        except Exception as e:
            logger.error(f"Search failed: {e}")
            print(f"❌ Search failed: {e}")
            return []

    async def _fallback_extract_content(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Fallback content extraction using requests and BeautifulSoup.

        Scrapes at most self.max_crawl_pages URLs. Each result dict has
        "url", "title", "content", "word_count", "extraction_success"
        (plus "error" on failure). Per-URL failures are recorded, not raised.
        """
        extracted_content = []

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        for i, url in enumerate(urls[:self.max_crawl_pages]):
            try:
                print(f"πŸ“– Scraping {i+1}/{min(len(urls), self.max_crawl_pages)}: {url}")

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Remove script and style elements (and obvious chrome)
                for script in soup(["script", "style", "nav", "footer", "header"]):
                    script.decompose()

                # Extract title. BUGFIX: soup.title.string is None for an
                # empty or non-string <title>, so guard before .strip() —
                # previously this raised AttributeError and the whole page
                # was misreported as an extraction failure.
                title = ""
                if soup.title and soup.title.string:
                    title = soup.title.string.strip()

                # Prefer a recognizable main-content container over <body>.
                content_selectors = [
                    'main', 'article', '.content', '#content',
                    '.post-content', '.entry-content', '.article-content'
                ]

                content = ""
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        content = content_elem.get_text(separator='\n', strip=True)
                        break

                # If no specific content area found, use body
                if not content:
                    content = soup.get_text(separator='\n', strip=True)

                # Clean up content: drop blank lines
                lines = [line.strip() for line in content.split('\n') if line.strip()]
                content = '\n'.join(lines)

                word_count = len(content.split())

                extracted_content.append({
                    "url": url,
                    "title": title,
                    "content": content,
                    "word_count": word_count,
                    "extraction_success": True
                })

                print(f"βœ… Extracted {word_count} words from {url}")

            except Exception as e:
                logger.error(f"Error scraping {url}: {e}")
                print(f"❌ Error scraping {url}: {e}")
                extracted_content.append({
                    "url": url,
                    "title": "",
                    "content": "",
                    "word_count": 0,
                    "extraction_success": False,
                    "error": str(e)
                })

        successful_extractions = [c for c in extracted_content if c["extraction_success"]]
        print(f"βœ… Successfully extracted content from {len(successful_extractions)}/{len(urls)} URLs")

        return extracted_content

    async def extract_content(self, urls: List[str], topic: str) -> List[Dict[str, Any]]:
        """Extract content from URLs using Crawl4AI with LLM filtering.

        Falls back to _fallback_extract_content when Crawl4AI is not
        importable, Playwright browsers are missing, or the crawl fails
        with a Playwright-related error. Result dicts use the same schema
        as _fallback_extract_content.
        """

        # If Crawl4AI is not available, use fallback immediately
        if not CRAWL4AI_AVAILABLE:
            print("πŸ”„ Using fallback content extraction (Crawl4AI not available)")
            return await self._fallback_extract_content(urls)

        # Check if Playwright browsers are installed before attempting a crawl
        try:
            from playwright.async_api import async_playwright
            async with async_playwright() as p:
                # Try to get browser path - this will fail if browsers aren't installed
                browser_path = p.chromium.executable_path
                if not browser_path or not os.path.exists(browser_path):
                    print("πŸ”„ Playwright browsers not installed, using fallback content extraction")
                    return await self._fallback_extract_content(urls)
        except Exception as e:
            print(f"πŸ”„ Playwright check failed ({e}), using fallback content extraction")
            return await self._fallback_extract_content(urls)

        try:
            print(f"πŸ“„ Extracting content from {len(urls)} URLs...")

            # Try to configure LLM content filter for educational content
            try:
                # Use the provider passed to the class, or fall back to environment/default
                crawl4ai_provider_simple = self.llm_provider

                # Map simple provider names to full provider/model format
                provider_mapping = {
                    "openai": "openai/gpt-4o-mini",
                    "google": "gemini/gemini-2.0-flash-exp",
                    "gemini": "gemini/gemini-2.0-flash-exp",
                    "anthropic": "gemini/gemini-2.0-flash-exp"  # Fallback since Crawl4AI doesn't support Anthropic directly
                }

                crawl4ai_provider = provider_mapping.get(crawl4ai_provider_simple, "openai/gpt-4o-mini")

                if crawl4ai_provider.startswith("gemini"):
                    # Check if Google API key is available
                    if not os.getenv("GOOGLE_API_KEY"):
                        print("⚠️ GOOGLE_API_KEY not found, falling back to OpenAI")
                        llm_config = LLMConfig(
                            provider="openai/gpt-4o-mini",
                            api_token="env:OPENAI_API_KEY"
                        )
                        print("🧠 Using OpenAI for content filtering: gpt-4o-mini (fallback)")
                    else:
                        llm_config = LLMConfig(
                            provider=crawl4ai_provider,
                            api_token="env:GOOGLE_API_KEY"
                        )
                        print(f"🧠 Using Gemini for content filtering: {crawl4ai_provider}")
                else:
                    # Default to OpenAI
                    llm_config = LLMConfig(
                        provider="openai/gpt-4o-mini",
                        api_token="env:OPENAI_API_KEY"
                    )
                    print("🧠 Using OpenAI for content filtering: gpt-4o-mini")

                content_filter = LLMContentFilter(
                    llm_config=llm_config,
                    instruction=f"""
                    Extract educational content related to "{topic}".
                    Focus on:
                    - Key concepts and explanations
                    - Practical examples and tutorials
                    - Technical details and specifications
                    - Best practices and guidelines
                    - Code examples and implementations

                    Exclude:
                    - Navigation menus and sidebars
                    - Advertisements and promotional content
                    - Footer content and legal text
                    - Unrelated content

                    Format as clean markdown with proper headers and code blocks.
                    """,
                    chunk_token_threshold=1000,
                    verbose=False
                )

                markdown_generator = DefaultMarkdownGenerator(
                    content_filter=content_filter,
                    options={"ignore_links": False}
                )
            except Exception as e:
                print(f"⚠️ Could not configure LLM content filter: {e}")
                # Fallback to basic markdown generator
                markdown_generator = DefaultMarkdownGenerator(
                    options={"ignore_links": False}
                )

            run_config = CrawlerRunConfig(
                markdown_generator=markdown_generator,
                cache_mode=CacheMode.BYPASS,
                wait_for_images=False,
                process_iframes=False,
                remove_overlay_elements=True
            )

            extracted_content = []

            async with AsyncWebCrawler(config=self.browser_config) as crawler:
                for i, url in enumerate(urls[:self.max_crawl_pages]):
                    try:
                        print(f"πŸ“– Crawling {i+1}/{min(len(urls), self.max_crawl_pages)}: {url}")

                        result = await crawler.arun(url=url, config=run_config)

                        if result.success and result.markdown:
                            # ROBUSTNESS: guard against result.metadata being
                            # None before .get().
                            extracted_content.append({
                                "url": url,
                                "title": (result.metadata or {}).get("title", ""),
                                "content": result.markdown,
                                "word_count": len(result.markdown.split()),
                                "extraction_success": True
                            })
                            print(f"βœ… Extracted {len(result.markdown.split())} words from {url}")
                        else:
                            print(f"⚠️ Failed to extract content from {url}: {result.error_message}")
                            extracted_content.append({
                                "url": url,
                                "title": "",
                                "content": "",
                                "word_count": 0,
                                "extraction_success": False,
                                "error": result.error_message
                            })

                    except Exception as e:
                        logger.error(f"Error crawling {url}: {e}")
                        print(f"❌ Error crawling {url}: {e}")
                        extracted_content.append({
                            "url": url,
                            "title": "",
                            "content": "",
                            "word_count": 0,
                            "extraction_success": False,
                            "error": str(e)
                        })

            successful_extractions = [c for c in extracted_content if c["extraction_success"]]
            print(f"βœ… Successfully extracted content from {len(successful_extractions)}/{len(urls)} URLs")

            return extracted_content

        except Exception as e:
            logger.error(f"Content extraction failed: {e}")
            print(f"❌ Content extraction failed: {e}")

            # If Crawl4AI fails (likely due to Playwright), try fallback.
            # These substrings match the error messages Playwright emits
            # when browser binaries are missing.
            error_str = str(e)
            playwright_errors = [
                "Executable doesn't exist",
                "BrowserType.launch",
                "playwright install",
                "Playwright was just installed",
                "download new browsers",
                "chromium-",
                "chrome-linux/chrome"
            ]

            if any(error in error_str for error in playwright_errors):
                print("πŸ”„ Playwright browser binaries not available, falling back to simple web scraping")
                return await self._fallback_extract_content(urls)

            return []

    async def research_topic(self, topic: str) -> Dict[str, Any]:
        """Complete research workflow: search + extract + summarize.

        Returns a dict with "topic", "search_results", "extracted_content",
        "summary", and "success" keys (plus "total_words" and
        "successful_sources" on success). Never raises; failures are
        reported via success=False.
        """
        try:
            print(f"πŸš€ Starting comprehensive research for: {topic}")

            # Step 1: Search for relevant URLs
            search_results = await self.search_topic(topic)

            if not search_results:
                return {
                    "topic": topic,
                    "search_results": [],
                    "extracted_content": [],
                    "summary": f"No search results found for {topic}",
                    "success": False
                }

            # Step 2: Extract content from top URLs
            urls = [result["url"] for result in search_results]
            extracted_content = await self.extract_content(urls, topic)

            # Step 3: Compile research summary
            successful_content = [c for c in extracted_content if c["extraction_success"]]
            total_words = sum(c["word_count"] for c in successful_content)

            summary = f"""
            Research completed for "{topic}":
            - Found {len(search_results)} search results
            - Successfully extracted content from {len(successful_content)} sources
            - Total content: {total_words} words
            - Sources include educational articles, documentation, and tutorials
            """

            print(f"πŸŽ‰ Research completed: {len(successful_content)} sources, {total_words} words")

            return {
                "topic": topic,
                "search_results": search_results,
                "extracted_content": extracted_content,
                "summary": summary.strip(),
                "total_words": total_words,
                "successful_sources": len(successful_content),
                "success": True
            }

        except Exception as e:
            logger.error(f"Research failed: {e}")
            print(f"❌ Research failed: {e}")
            return {
                "topic": topic,
                "search_results": [],
                "extracted_content": [],
                "summary": f"Research failed for {topic}: {str(e)}",
                "success": False
            }


async def research_topic(topic: str, llm_provider: str = "openai") -> Dict[str, Any]:
    """Convenience wrapper: run the full WebResearcher workflow for *topic*.

    Builds a fresh WebResearcher configured with *llm_provider* and awaits
    its research_topic() pipeline (search, extract, summarize).
    """
    researcher = WebResearcher(llm_provider=llm_provider)
    result = await researcher.research_topic(topic)
    return result