Integrate Crawl4AI for enhanced web scraping

- Replace BeautifulSoup with Crawl4AI for better content extraction
- Create dedicated scraping_service.py module with Crawl4AIScraper class
- Update Chat Support assistant to use Crawl4AI for grounding URLs
- Update deployment package template to include Crawl4AI
- Handle async/sync conversion for Gradio compatibility (a minimal sketch of this bridge follows below)
- Update requirements.txt with flexible version constraints

Files changed:
- app.py               +34 -34
- requirements.txt       +5  -4
- scraping_service.py  +146  -0
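The async/sync bridge mentioned above is the main compatibility concern: Crawl4AI's AsyncWebCrawler is async-only, while the Gradio event handlers in this app are plain synchronous functions. A minimal sketch of the pattern, assuming the crawl4ai package is installed; the helper names here (_scrape, scrape_sync) are illustrative and not part of the commit:

# Sketch of the async-to-sync bridge used for Gradio compatibility.
# Assumes crawl4ai is installed; helper names are illustrative only.
import asyncio
import concurrent.futures

from crawl4ai import AsyncWebCrawler


async def _scrape(url: str) -> str:
    """Crawl one URL asynchronously and return its markdown text."""
    async with AsyncWebCrawler(verbose=False) as crawler:
        result = await crawler.arun(url=url, bypass_cache=True)
        return result.markdown or ""


def scrape_sync(url: str) -> str:
    """Call the async crawler from a synchronous Gradio callback."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop running in this thread: start one directly.
        return asyncio.run(_scrape(url))
    # A loop is already running: run the coroutine in a worker thread
    # with its own event loop, mirroring scraping_service.py below.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        return pool.submit(asyncio.run, _scrape(url)).result()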
app.py CHANGED

@@ -7,6 +7,7 @@ from datetime import datetime
 from dotenv import load_dotenv
 import requests
 from bs4 import BeautifulSoup
+from scraping_service import get_grounding_context_crawl4ai, fetch_url_content_crawl4ai
 
 # Load environment variables from .env file
 load_dotenv()

@@ -16,7 +17,8 @@ SPACE_TEMPLATE = '''import gradio as gr
 import os
 import requests
 import json
-
+import asyncio
+from crawl4ai import AsyncWebCrawler
 
 # Configuration
 SPACE_NAME = "{name}"

@@ -28,30 +30,36 @@ GROUNDING_URLS = {grounding_urls}
 # Get API key from environment - customizable variable name
 API_KEY = os.environ.get("{api_key_var}")
 
-def fetch_url_content(url):
-    """Fetch and extract text content from a URL"""
+async def fetch_url_content_async(url, crawler):
+    """Fetch and extract text content from a URL using Crawl4AI"""
     try:
-        [eight removed lines (the old request/parse code) are collapsed in the diff view]
-        # Get text content
-        text = soup.get_text()
-
-        # Clean up whitespace
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
-
-        # Truncate to ~4000 characters
-        if len(text) > 4000:
-            text = text[:4000] + "..."
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            word_count_threshold=10,
+            excluded_tags=['script', 'style', 'nav', 'header', 'footer'],
+            remove_overlay_elements=True
+        )
 
-
+        if result.success:
+            content = result.markdown or result.cleaned_html or ""
+            # Truncate to ~4000 characters
+            if len(content) > 4000:
+                content = content[:4000] + "..."
+            return content
+        else:
+            return f"Error fetching {{url}}: Failed to retrieve content"
+    except Exception as e:
+        return f"Error fetching {{url}}: {{str(e)}}"
+
+def fetch_url_content(url):
+    """Synchronous wrapper for URL fetching"""
+    async def fetch():
+        async with AsyncWebCrawler(verbose=False) as crawler:
+            return await fetch_url_content_async(url, crawler)
+
+    try:
+        return asyncio.run(fetch())
     except Exception as e:
         return f"Error fetching {{url}}: {{str(e)}}"
 

@@ -139,18 +147,10 @@ if __name__ == "__main__":
 # Available models
 MODELS = [
     "google/gemma-3-27b-it",
-<<<<<<< HEAD
-    "mistralai/mixtral-8x7b-instruct",
-    "meta-llama/llama-3.1-70b-instruct",
-    "anthropic/claude-3.5-haiku",
-    "nvidia/nemotron-4-340b-instruct",
-    "openai/gpt-3.5-turbo"
-=======
     "google/gemini-2.0-flash-001",
     "mistralai/mistral-medium",
     "openai/gpt-4o-nano",
     "anthropic/claude-3.5-haiku"
->>>>>>> c997ea6 (Update model selection to five current models and remove cost information)
 ]
 
 def fetch_url_content(url):

@@ -282,7 +282,7 @@ Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} with Chat U/I Helper
 
 def create_requirements():
     """Generate requirements.txt"""
-    return "gradio==4.44.1\nrequests==2.32.3\…  [old line truncated in the diff view]
+    return "gradio==4.44.1\nrequests==2.32.3\ncrawl4ai==0.4.245"
 
 def generate_zip(name, description, system_prompt, model, api_key_var, temperature, max_tokens, examples_text, url1="", url2="", url3="", url4=""):
     """Generate deployable zip file"""

@@ -386,9 +386,9 @@ def respond(message, chat_history, url1="", url2="", url3="", url4=""):
         chat_history.append([message, response])
         return "", chat_history
 
-    # Get grounding context from URLs
+    # Get grounding context from URLs using Crawl4AI
    grounding_urls = [url1, url2, url3, url4]
-    grounding_context = …  [old line truncated in the diff view]
+    grounding_context = get_grounding_context_crawl4ai(grounding_urls)
 
    # Build enhanced system prompt with grounding context
    base_system_prompt = """You are an expert assistant specializing in Gradio configurations for HuggingFace Spaces. You have deep knowledge of:
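For reference, a hedged sketch of how the updated respond() path consumes the grounding context. The names get_grounding_context_crawl4ai, grounding_context, and base_system_prompt come from the diff above; build_messages is an illustrative stand-in for the surrounding prompt-assembly code, which is not shown in this hunk:

# Illustrative only: shows where the Crawl4AI grounding context enters the prompt.
from typing import Dict, List

from scraping_service import get_grounding_context_crawl4ai


def build_messages(message: str, base_system_prompt: str, urls: List[str]) -> List[Dict[str, str]]:
    # Fetch and format context from the grounding URLs (blank entries are ignored).
    grounding_context = get_grounding_context_crawl4ai(urls)
    # Append the context to the base system prompt before the model call.
    enhanced_system_prompt = base_system_prompt + grounding_context
    return [
        {"role": "system", "content": enhanced_system_prompt},
        {"role": "user", "content": message},
    ]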
requirements.txt CHANGED

@@ -1,4 +1,5 @@
-gradio
-requests
-beautifulsoup4
-python-dotenv
+gradio>=4.44.0
+requests>=2.32.3
+beautifulsoup4>=4.12.3
+python-dotenv>=1.0.0
+crawl4ai>=0.4.245
scraping_service.py ADDED

@@ -0,0 +1,146 @@
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
from typing import List, Dict, Optional

class Crawl4AIScraper:
    """Web scraping service using Crawl4AI for better content extraction"""

    def __init__(self):
        self.crawler = None

    async def __aenter__(self):
        """Initialize the crawler when entering async context"""
        self.crawler = AsyncWebCrawler(verbose=False)
        await self.crawler.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up the crawler when exiting async context"""
        if self.crawler:
            await self.crawler.__aexit__(exc_type, exc_val, exc_tb)

    async def scrape_url(self, url: str, max_chars: int = 4000) -> str:
        """
        Scrape a single URL and extract text content

        Args:
            url: The URL to scrape
            max_chars: Maximum characters to return (default 4000)

        Returns:
            Extracted text content or error message
        """
        try:
            # Perform the crawl
            result = await self.crawler.arun(
                url=url,
                bypass_cache=True,
                word_count_threshold=10,
                excluded_tags=['script', 'style', 'nav', 'header', 'footer'],
                remove_overlay_elements=True
            )

            if result.success:
                # Get cleaned text content - prefer markdown over cleaned_html
                content = result.markdown or result.cleaned_html or ""

                # Truncate if needed
                if len(content) > max_chars:
                    content = content[:max_chars] + "..."

                return content
            else:
                return f"Error fetching {url}: Failed to retrieve content"

        except Exception as e:
            return f"Error fetching {url}: {str(e)}"

    async def scrape_multiple_urls(self, urls: List[str], max_chars_per_url: int = 4000) -> Dict[str, str]:
        """
        Scrape multiple URLs concurrently

        Args:
            urls: List of URLs to scrape
            max_chars_per_url: Maximum characters per URL

        Returns:
            Dictionary mapping URLs to their content
        """
        # Filter blank entries first so gathered results stay aligned with their URLs
        valid_urls = [url for url in urls if url and url.strip()]
        tasks = [self.scrape_url(url, max_chars_per_url) for url in valid_urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        url_content = {}
        for url, result in zip(valid_urls, results):
            if isinstance(result, Exception):
                url_content[url] = f"Error fetching {url}: {str(result)}"
            else:
                url_content[url] = result

        return url_content

def get_grounding_context_crawl4ai(urls: List[str]) -> str:
    """
    Synchronous wrapper to fetch grounding context using Crawl4AI

    Args:
        urls: List of URLs to fetch context from

    Returns:
        Formatted grounding context string
    """
    if not urls:
        return ""

    # Filter valid URLs
    valid_urls = [url for url in urls if url and url.strip()]
    if not valid_urls:
        return ""

    async def fetch_all():
        async with Crawl4AIScraper() as scraper:
            return await scraper.scrape_multiple_urls(valid_urls)

    # Run the async function - handle existing event loop
    try:
        loop = asyncio.get_running_loop()
        # We're already in an async context, create a new event loop in a thread
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(asyncio.run, fetch_all())
            url_content = future.result()
    except RuntimeError:
        # No event loop running, we can use asyncio.run directly
        url_content = asyncio.run(fetch_all())
    except Exception as e:
        return f"Error initializing scraper: {str(e)}"

    # Format the context
    context_parts = []
    for i, (url, content) in enumerate(url_content.items(), 1):
        context_parts.append(f"Context from URL {i} ({url}):\n{content}")

    if context_parts:
        return "\n\n" + "\n\n".join(context_parts) + "\n\n"
    return ""

# Backwards compatibility function
def fetch_url_content_crawl4ai(url: str) -> str:
    """
    Synchronous wrapper for single URL scraping (backwards compatibility)

    Args:
        url: The URL to fetch

    Returns:
        Extracted content or error message
    """
    async def fetch_one():
        async with Crawl4AIScraper() as scraper:
            return await scraper.scrape_url(url)

    try:
        return asyncio.run(fetch_one())
    except Exception as e:
        return f"Error fetching {url}: {str(e)}"
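A short usage sketch of the new module from synchronous code (the URLs below are placeholders; return shapes follow the functions above):

# Example usage of scraping_service.py from synchronous code.
from scraping_service import (
    fetch_url_content_crawl4ai,
    get_grounding_context_crawl4ai,
)

if __name__ == "__main__":
    # Single page: returns up to ~4000 characters of markdown/cleaned text,
    # or an "Error fetching ..." string on failure.
    print(fetch_url_content_crawl4ai("https://example.com")[:200])

    # Several grounding URLs at once; blank entries are filtered out and the
    # result is a formatted "Context from URL N (...)" block.
    context = get_grounding_context_crawl4ai([
        "https://example.com",
        "",
        "https://www.gradio.app/docs",
    ])
    print(context[:500])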