sizzlebop committed
Commit 8be2f43 · verified · 1 Parent(s): 4d85aba

Upload 34 files

coursecrafter/tools/__pycache__/web_research.cpython-311.pyc CHANGED
Binary files a/coursecrafter/tools/__pycache__/web_research.cpython-311.pyc and b/coursecrafter/tools/__pycache__/web_research.cpython-311.pyc differ
 
coursecrafter/tools/web_research.py CHANGED
@@ -4,14 +4,23 @@ Advanced web research using DuckDuckGo search and Crawl4AI content extraction
"""

import os
+ import requests
from typing import List, Dict, Any, Optional
from duckduckgo_search import DDGS
- from crawl4ai import (
-     AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig,
-     LLMContentFilter, DefaultMarkdownGenerator
- )
+ from bs4 import BeautifulSoup
import logging

+ # Try to import Crawl4AI, but have a fallback if it fails
+ try:
+     from crawl4ai import (
+         AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig,
+         LLMContentFilter, DefaultMarkdownGenerator
+     )
+     CRAWL4AI_AVAILABLE = True
+ except ImportError:
+     CRAWL4AI_AVAILABLE = False
+     print("⚠️ Crawl4AI not available, using fallback web scraping")
+
logger = logging.getLogger(__name__)

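Note: this first hunk makes Crawl4AI an optional dependency behind a module-level availability flag, so the rest of the module can branch on CRAWL4AI_AVAILABLE instead of re-attempting the import. A minimal standalone sketch of the same pattern (the require_crawler helper is illustrative, not part of the commit):

# Sketch of the optional-import pattern; runs whether or not crawl4ai is installed
try:
    from crawl4ai import AsyncWebCrawler
    CRAWL4AI_AVAILABLE = True
except ImportError:
    CRAWL4AI_AVAILABLE = False

def require_crawler():
    """Fail loudly if the optional dependency is missing (illustrative helper)."""
    if not CRAWL4AI_AVAILABLE:
        raise RuntimeError("crawl4ai not installed; use the fallback scraper")
    return AsyncWebCrawler
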
@@ -22,11 +31,16 @@ class WebResearcher:
        self.max_results = max_results
        self.max_crawl_pages = max_crawl_pages
        self.llm_provider = llm_provider or "openai"  # Default fallback
-         self.browser_config = BrowserConfig(
-             headless=True,
-             viewport_width=1280,
-             viewport_height=720
-         )
+
+         if CRAWL4AI_AVAILABLE:
+             self.browser_config = BrowserConfig(
+                 headless=True,
+                 viewport_width=1280,
+                 viewport_height=720
+             )
+         else:
+             self.browser_config = None
+             print("🔄 Using fallback web scraping (requests + BeautifulSoup)")

    async def search_topic(self, topic: str, region: str = "us-en") -> List[Dict[str, Any]]:
        """Search for a topic using DuckDuckGo"""
@@ -58,8 +72,103 @@ class WebResearcher:
            print(f"❌ Search failed: {e}")
            return []

+     async def _fallback_extract_content(self, urls: List[str]) -> List[Dict[str, Any]]:
+         """Fallback content extraction using requests and BeautifulSoup"""
+         extracted_content = []
+
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         }
+
+         for i, url in enumerate(urls[:self.max_crawl_pages]):
+             try:
+                 print(f"📖 Scraping {i+1}/{min(len(urls), self.max_crawl_pages)}: {url}")
+
+                 response = requests.get(url, headers=headers, timeout=10)
+                 response.raise_for_status()
+
+                 soup = BeautifulSoup(response.content, 'html.parser')
+
+                 # Remove script and style elements
+                 for script in soup(["script", "style", "nav", "footer", "header"]):
+                     script.decompose()
+
+                 # Extract title
+                 title = ""
+                 if soup.title:
+                     title = soup.title.string.strip()
+
+                 # Extract main content
+                 content_selectors = [
+                     'main', 'article', '.content', '#content',
+                     '.post-content', '.entry-content', '.article-content'
+                 ]
+
+                 content = ""
+                 for selector in content_selectors:
+                     content_elem = soup.select_one(selector)
+                     if content_elem:
+                         content = content_elem.get_text(separator='\n', strip=True)
+                         break
+
+                 # If no specific content area found, use body
+                 if not content:
+                     content = soup.get_text(separator='\n', strip=True)
+
+                 # Clean up content
+                 lines = [line.strip() for line in content.split('\n') if line.strip()]
+                 content = '\n'.join(lines)
+
+                 word_count = len(content.split())
+
+                 extracted_content.append({
+                     "url": url,
+                     "title": title,
+                     "content": content,
+                     "word_count": word_count,
+                     "extraction_success": True
+                 })
+
+                 print(f"✅ Extracted {word_count} words from {url}")
+
+             except Exception as e:
+                 logger.error(f"Error scraping {url}: {e}")
+                 print(f"❌ Error scraping {url}: {e}")
+                 extracted_content.append({
+                     "url": url,
+                     "title": "",
+                     "content": "",
+                     "word_count": 0,
+                     "extraction_success": False,
+                     "error": str(e)
+                 })
+
+         successful_extractions = [c for c in extracted_content if c["extraction_success"]]
+         print(f"✅ Successfully extracted content from {len(successful_extractions)}/{len(urls)} URLs")
+
+         return extracted_content
+
    async def extract_content(self, urls: List[str], topic: str) -> List[Dict[str, Any]]:
        """Extract content from URLs using Crawl4AI with LLM filtering"""
+
+         # If Crawl4AI is not available, use fallback immediately
+         if not CRAWL4AI_AVAILABLE:
+             print("🔄 Using fallback content extraction (Crawl4AI not available)")
+             return await self._fallback_extract_content(urls)
+
+         # Check if Playwright browsers are installed
+         try:
+             from playwright.async_api import async_playwright
+             async with async_playwright() as p:
+                 # Try to get browser path - this will fail if browsers aren't installed
+                 browser_path = p.chromium.executable_path
+                 if not browser_path or not os.path.exists(browser_path):
+                     print("🔄 Playwright browsers not installed, using fallback content extraction")
+                     return await self._fallback_extract_content(urls)
+         except Exception as e:
+             print(f"🔄 Playwright check failed ({e}), using fallback content extraction")
+             return await self._fallback_extract_content(urls)
+
        try:
            print(f"📄 Extracting content from {len(urls)} URLs...")

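Note: the new _fallback_extract_content helper tries a priority list of likely content containers before falling back to full-page text. A condensed, runnable sketch of that idea (the function name and trimmed selector list are illustrative, not from the commit):

import requests
from bs4 import BeautifulSoup

def scrape_main_text(url: str, timeout: int = 10) -> str:
    # Fetch with a browser-like User-Agent, as the fallback scraper does
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    # Drop page chrome that would pollute the extracted text
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    # Prefer a dedicated content container; fall back to the whole page
    for selector in ("main", "article", ".content", "#content"):
        node = soup.select_one(selector)
        if node:
            return node.get_text(separator="\n", strip=True)
    return soup.get_text(separator="\n", strip=True)
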
@@ -192,6 +301,23 @@ class WebResearcher:
        except Exception as e:
            logger.error(f"Content extraction failed: {e}")
            print(f"❌ Content extraction failed: {e}")
+
+             # If Crawl4AI fails (likely due to Playwright), try fallback
+             error_str = str(e)
+             playwright_errors = [
+                 "Executable doesn't exist",
+                 "BrowserType.launch",
+                 "playwright install",
+                 "Playwright was just installed",
+                 "download new browsers",
+                 "chromium-",
+                 "chrome-linux/chrome"
+             ]
+
+             if any(error in error_str for error in playwright_errors):
+                 print("🔄 Playwright browser binaries not available, falling back to simple web scraping")
+                 return await self._fallback_extract_content(urls)
+
            return []

    async def research_topic(self, topic: str) -> Dict[str, Any]:
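Note: this last hunk routes Crawl4AI failures to the fallback only when the exception text looks Playwright-related. The dispatch reduces to a substring match, sketched here with an abbreviated marker list (is_playwright_error is an illustrative name, not part of the commit):

# Sketch of the substring-based error classification used in the except block
PLAYWRIGHT_MARKERS = ("Executable doesn't exist", "playwright install", "BrowserType.launch")

def is_playwright_error(exc: Exception) -> bool:
    # True if the exception text suggests a missing or broken browser install
    return any(marker in str(exc) for marker in PLAYWRIGHT_MARKERS)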
 