milwright committed
Commit f3a35a2 · 1 Parent(s): df94830

Integrate Crawl4AI for enhanced web scraping


- Replace BeautifulSoup with Crawl4AI for better content extraction
- Create dedicated scraping_service.py module with Crawl4AIScraper class
- Update Chat Support assistant to use Crawl4AI for grounding URLs
- Update deployment package template to include Crawl4AI
- Handle async/sync conversion for Gradio compatibility (see the sketch after this list)
- Update requirements.txt with flexible version constraints
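As background for the async/sync conversion noted above, here is a minimal sketch of the pattern this commit relies on: Crawl4AI's crawler is async, while Gradio callbacks are synchronous, so the coroutine is run either directly with asyncio.run or, when an event loop is already running, on a fresh loop in a worker thread. The fetch_markdown helper and its standalone form are illustrative only; the crawler calls mirror the ones added in this commit.

import asyncio
import concurrent.futures

from crawl4ai import AsyncWebCrawler


def fetch_markdown(url: str) -> str:
    """Illustrative sync wrapper around Crawl4AI's async crawler."""
    async def crawl() -> str:
        async with AsyncWebCrawler(verbose=False) as crawler:
            result = await crawler.arun(url=url, bypass_cache=True)
            return result.markdown or ""

    try:
        # A loop is already running (e.g. inside an async framework):
        # run the coroutine on its own loop in a worker thread.
        asyncio.get_running_loop()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            return executor.submit(asyncio.run, crawl()).result()
    except RuntimeError:
        # No running loop: asyncio.run is safe to call directly.
        return asyncio.run(crawl())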

Files changed (3):
  1. app.py +34 -34
  2. requirements.txt +5 -4
  3. scraping_service.py +146 -0
app.py CHANGED
@@ -7,6 +7,7 @@ from datetime import datetime
 from dotenv import load_dotenv
 import requests
 from bs4 import BeautifulSoup
+from scraping_service import get_grounding_context_crawl4ai, fetch_url_content_crawl4ai
 
 # Load environment variables from .env file
 load_dotenv()
@@ -16,7 +17,8 @@ SPACE_TEMPLATE = '''import gradio as gr
 import os
 import requests
 import json
-from bs4 import BeautifulSoup
+import asyncio
+from crawl4ai import AsyncWebCrawler
 
 # Configuration
 SPACE_NAME = "{name}"
@@ -28,30 +30,36 @@ GROUNDING_URLS = {grounding_urls}
 # Get API key from environment - customizable variable name
 API_KEY = os.environ.get("{api_key_var}")
 
-def fetch_url_content(url):
-    """Fetch and extract text content from a URL"""
+async def fetch_url_content_async(url, crawler):
+    """Fetch and extract text content from a URL using Crawl4AI"""
     try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Remove script and style elements
-        for script in soup(["script", "style"]):
-            script.decompose()
-
-        # Get text content
-        text = soup.get_text()
-
-        # Clean up whitespace
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
-
-        # Truncate to ~4000 characters
-        if len(text) > 4000:
-            text = text[:4000] + "..."
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            word_count_threshold=10,
+            excluded_tags=['script', 'style', 'nav', 'header', 'footer'],
+            remove_overlay_elements=True
+        )
 
-        return text
+        if result.success:
+            content = result.markdown or result.cleaned_html or ""
+            # Truncate to ~4000 characters
+            if len(content) > 4000:
+                content = content[:4000] + "..."
+            return content
+        else:
+            return f"Error fetching {{url}}: Failed to retrieve content"
+    except Exception as e:
+        return f"Error fetching {{url}}: {{str(e)}}"
+
+def fetch_url_content(url):
+    """Synchronous wrapper for URL fetching"""
+    async def fetch():
+        async with AsyncWebCrawler(verbose=False) as crawler:
+            return await fetch_url_content_async(url, crawler)
+
+    try:
+        return asyncio.run(fetch())
     except Exception as e:
         return f"Error fetching {{url}}: {{str(e)}}"
 
@@ -139,18 +147,10 @@ if __name__ == "__main__":
 # Available models
 MODELS = [
     "google/gemma-3-27b-it",
-<<<<<<< HEAD
-    "mistralai/mixtral-8x7b-instruct",
-    "meta-llama/llama-3.1-70b-instruct",
-    "anthropic/claude-3.5-haiku",
-    "nvidia/nemotron-4-340b-instruct",
-    "openai/gpt-3.5-turbo"
-=======
     "google/gemini-2.0-flash-001",
    "mistralai/mistral-medium",
    "openai/gpt-4o-nano",
    "anthropic/claude-3.5-haiku"
->>>>>>> c997ea6 (Update model selection to five current models and remove cost information)
 ]
 
 def fetch_url_content(url):
@@ -282,7 +282,7 @@ Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} with Chat U/I Helper
 
 def create_requirements():
     """Generate requirements.txt"""
-    return "gradio==4.44.1\nrequests==2.32.3\nbeautifulsoup4==4.12.3"
+    return "gradio==4.44.1\nrequests==2.32.3\ncrawl4ai==0.4.245"
 
 def generate_zip(name, description, system_prompt, model, api_key_var, temperature, max_tokens, examples_text, url1="", url2="", url3="", url4=""):
     """Generate deployable zip file"""
@@ -386,9 +386,9 @@ def respond(message, chat_history, url1="", url2="", url3="", url4=""):
         chat_history.append([message, response])
         return "", chat_history
 
-    # Get grounding context from URLs
+    # Get grounding context from URLs using Crawl4AI
    grounding_urls = [url1, url2, url3, url4]
-    grounding_context = get_grounding_context(grounding_urls)
+    grounding_context = get_grounding_context_crawl4ai(grounding_urls)
 
    # Build enhanced system prompt with grounding context
    base_system_prompt = """You are an expert assistant specializing in Gradio configurations for HuggingFace Spaces. You have deep knowledge of:
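One detail worth keeping in mind when reading the hunks above: the generated chat app lives inside SPACE_TEMPLATE, a Python format string, so placeholders such as {name} are substituted at generation time while literal braces in the generated f-strings are doubled ({{url}}, {{str(e)}}). A tiny illustration, with a made-up TEMPLATE and value:

# Doubled braces survive str.format() as literal braces in the generated file.
TEMPLATE = 'SPACE_NAME = "{name}"\nerror = f"Error fetching {{url}}: {{str(e)}}"'
print(TEMPLATE.format(name="Chat Support"))
# SPACE_NAME = "Chat Support"
# error = f"Error fetching {url}: {str(e)}"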
requirements.txt CHANGED
@@ -1,4 +1,5 @@
-gradio==4.32.0
-requests==2.32.3
-beautifulsoup4==4.12.3
-python-dotenv==1.0.0
+gradio>=4.44.0
+requests>=2.32.3
+beautifulsoup4>=4.12.3
+python-dotenv>=1.0.0
+crawl4ai>=0.4.245
scraping_service.py ADDED
@@ -0,0 +1,146 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
+import json
+from typing import List, Dict, Optional
+
+class Crawl4AIScraper:
+    """Web scraping service using Crawl4AI for better content extraction"""
+
+    def __init__(self):
+        self.crawler = None
+
+    async def __aenter__(self):
+        """Initialize the crawler when entering async context"""
+        self.crawler = AsyncWebCrawler(verbose=False)
+        await self.crawler.__aenter__()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Clean up the crawler when exiting async context"""
+        if self.crawler:
+            await self.crawler.__aexit__(exc_type, exc_val, exc_tb)
+
+    async def scrape_url(self, url: str, max_chars: int = 4000) -> str:
+        """
+        Scrape a single URL and extract text content
+
+        Args:
+            url: The URL to scrape
+            max_chars: Maximum characters to return (default 4000)
+
+        Returns:
+            Extracted text content or error message
+        """
+        try:
+            # Perform the crawl
+            result = await self.crawler.arun(
+                url=url,
+                bypass_cache=True,
+                word_count_threshold=10,
+                excluded_tags=['script', 'style', 'nav', 'header', 'footer'],
+                remove_overlay_elements=True
+            )
+
+            if result.success:
+                # Get cleaned text content - prefer markdown over cleaned_html
+                content = result.markdown or result.cleaned_html or ""
+
+                # Truncate if needed
+                if len(content) > max_chars:
+                    content = content[:max_chars] + "..."
+
+                return content
+            else:
+                return f"Error fetching {url}: Failed to retrieve content"
+
+        except Exception as e:
+            return f"Error fetching {url}: {str(e)}"
+
+    async def scrape_multiple_urls(self, urls: List[str], max_chars_per_url: int = 4000) -> Dict[str, str]:
+        """
+        Scrape multiple URLs concurrently
+
+        Args:
+            urls: List of URLs to scrape
+            max_chars_per_url: Maximum characters per URL
+
+        Returns:
+            Dictionary mapping URLs to their content
+        """
+        tasks = [self.scrape_url(url, max_chars_per_url) for url in urls if url and url.strip()]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        url_content = {}
+        for url, result in zip(urls, results):
+            if isinstance(result, Exception):
+                url_content[url] = f"Error fetching {url}: {str(result)}"
+            else:
+                url_content[url] = result
+
+        return url_content
+
+def get_grounding_context_crawl4ai(urls: List[str]) -> str:
+    """
+    Synchronous wrapper to fetch grounding context using Crawl4AI
+
+    Args:
+        urls: List of URLs to fetch context from
+
+    Returns:
+        Formatted grounding context string
+    """
+    if not urls:
+        return ""
+
+    # Filter valid URLs
+    valid_urls = [url for url in urls if url and url.strip()]
+    if not valid_urls:
+        return ""
+
+    async def fetch_all():
+        async with Crawl4AIScraper() as scraper:
+            return await scraper.scrape_multiple_urls(valid_urls)
+
+    # Run the async function - handle existing event loop
+    try:
+        loop = asyncio.get_running_loop()
+        # We're already in an async context, create a new event loop in a thread
+        import concurrent.futures
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future = executor.submit(asyncio.run, fetch_all())
+            url_content = future.result()
+    except RuntimeError:
+        # No event loop running, we can use asyncio.run directly
+        url_content = asyncio.run(fetch_all())
+    except Exception as e:
+        return f"Error initializing scraper: {str(e)}"
+
+    # Format the context
+    context_parts = []
+    for i, (url, content) in enumerate(url_content.items(), 1):
+        context_parts.append(f"Context from URL {i} ({url}):\n{content}")
+
+    if context_parts:
+        return "\n\n" + "\n\n".join(context_parts) + "\n\n"
+    return ""
+
+# Backwards compatibility function
+def fetch_url_content_crawl4ai(url: str) -> str:
+    """
+    Synchronous wrapper for single URL scraping (backwards compatibility)
+
+    Args:
+        url: The URL to fetch
+
+    Returns:
+        Extracted content or error message
+    """
+    async def fetch_one():
+        async with Crawl4AIScraper() as scraper:
+            return await scraper.scrape_url(url)
+
+    try:
+        return asyncio.run(fetch_one())
+    except Exception as e:
+        return f"Error fetching {url}: {str(e)}"