Integrate Crawl4AI for enhanced web scraping

- Replace BeautifulSoup with Crawl4AI for better content extraction
- Create dedicated scraping_service.py module with Crawl4AIScraper class
- Update Chat Support assistant to use Crawl4AI for grounding URLs
- Update deployment package template to include Crawl4AI
- Handle async/sync conversion for Gradio compatibility (a minimal sketch of this bridge follows below)
- Update requirements.txt with flexible version constraints

Files changed:
- app.py               +34 -34
- requirements.txt       +5  -4
- scraping_service.py  +146  -0
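The async/sync bridge mentioned above is the main compatibility concern: Crawl4AI's AsyncWebCrawler is async-only, while the Gradio event handlers in this app are plain synchronous functions. A minimal sketch of the pattern, assuming the crawl4ai package is installed; the helper names here (_scrape, scrape_sync) are illustrative and not part of the commit:

# Sketch of the async-to-sync bridge used for Gradio compatibility.
# Assumes crawl4ai is installed; helper names are illustrative only.
import asyncio
import concurrent.futures

from crawl4ai import AsyncWebCrawler


async def _scrape(url: str) -> str:
    """Crawl one URL asynchronously and return its markdown text."""
    async with AsyncWebCrawler(verbose=False) as crawler:
        result = await crawler.arun(url=url, bypass_cache=True)
        return result.markdown or ""


def scrape_sync(url: str) -> str:
    """Call the async crawler from a synchronous Gradio callback."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop running in this thread: start one directly.
        return asyncio.run(_scrape(url))
    # A loop is already running: run the coroutine in a worker thread
    # with its own event loop, mirroring scraping_service.py below.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        return pool.submit(asyncio.run, _scrape(url)).result()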
app.py CHANGED

@@ -7,6 +7,7 @@ from datetime import datetime
 from dotenv import load_dotenv
 import requests
 from bs4 import BeautifulSoup
+from scraping_service import get_grounding_context_crawl4ai, fetch_url_content_crawl4ai
 
 # Load environment variables from .env file
 load_dotenv()

@@ -16,7 +17,8 @@ SPACE_TEMPLATE = '''import gradio as gr
 import os
 import requests
 import json
-
+import asyncio
+from crawl4ai import AsyncWebCrawler
 
 # Configuration
 SPACE_NAME = "{name}"

@@ -28,30 +30,36 @@ GROUNDING_URLS = {grounding_urls}
 # Get API key from environment - customizable variable name
 API_KEY = os.environ.get("{api_key_var}")
 
-def fetch_url_content(url):
-    """Fetch and extract text content from a URL"""
+async def fetch_url_content_async(url, crawler):
+    """Fetch and extract text content from a URL using Crawl4AI"""
     try:
-        [eight removed lines (the old request/parse code) are collapsed in the diff view]
-        # Get text content
-        text = soup.get_text()
-
-        # Clean up whitespace
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
-
-        # Truncate to ~4000 characters
-        if len(text) > 4000:
-            text = text[:4000] + "..."
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            word_count_threshold=10,
+            excluded_tags=['script', 'style', 'nav', 'header', 'footer'],
+            remove_overlay_elements=True
+        )
 
-
+        if result.success:
+            content = result.markdown or result.cleaned_html or ""
+            # Truncate to ~4000 characters
+            if len(content) > 4000:
+                content = content[:4000] + "..."
+            return content
+        else:
+            return f"Error fetching {{url}}: Failed to retrieve content"
+    except Exception as e:
+        return f"Error fetching {{url}}: {{str(e)}}"
+
+def fetch_url_content(url):
+    """Synchronous wrapper for URL fetching"""
+    async def fetch():
+        async with AsyncWebCrawler(verbose=False) as crawler:
+            return await fetch_url_content_async(url, crawler)
+
+    try:
+        return asyncio.run(fetch())
     except Exception as e:
         return f"Error fetching {{url}}: {{str(e)}}"
 

@@ -139,18 +147,10 @@ if __name__ == "__main__":
 # Available models
 MODELS = [
     "google/gemma-3-27b-it",
-<<<<<<< HEAD
-    "mistralai/mixtral-8x7b-instruct",
-    "meta-llama/llama-3.1-70b-instruct",
-    "anthropic/claude-3.5-haiku",
-    "nvidia/nemotron-4-340b-instruct",
-    "openai/gpt-3.5-turbo"
-=======
     "google/gemini-2.0-flash-001",
     "mistralai/mistral-medium",
     "openai/gpt-4o-nano",
     "anthropic/claude-3.5-haiku"
->>>>>>> c997ea6 (Update model selection to five current models and remove cost information)
 ]
 
 def fetch_url_content(url):

@@ -282,7 +282,7 @@ Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} with Chat U/I Helper
 
 def create_requirements():
     """Generate requirements.txt"""
-    return "gradio==4.44.1\nrequests==2.32.3\…  [old line truncated in the diff view]
+    return "gradio==4.44.1\nrequests==2.32.3\ncrawl4ai==0.4.245"
 
 def generate_zip(name, description, system_prompt, model, api_key_var, temperature, max_tokens, examples_text, url1="", url2="", url3="", url4=""):
     """Generate deployable zip file"""

@@ -386,9 +386,9 @@ def respond(message, chat_history, url1="", url2="", url3="", url4=""):
         chat_history.append([message, response])
         return "", chat_history
 
-    # Get grounding context from URLs
+    # Get grounding context from URLs using Crawl4AI
    grounding_urls = [url1, url2, url3, url4]
-    grounding_context = …  [old line truncated in the diff view]
+    grounding_context = get_grounding_context_crawl4ai(grounding_urls)
 
    # Build enhanced system prompt with grounding context
    base_system_prompt = """You are an expert assistant specializing in Gradio configurations for HuggingFace Spaces. You have deep knowledge of:
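For reference, a hedged sketch of how the updated respond() path consumes the grounding context. The names get_grounding_context_crawl4ai, grounding_context, and base_system_prompt come from the diff above; build_messages is an illustrative stand-in for the surrounding prompt-assembly code, which is not shown in this hunk:

# Illustrative only: shows where the Crawl4AI grounding context enters the prompt.
from typing import Dict, List

from scraping_service import get_grounding_context_crawl4ai


def build_messages(message: str, base_system_prompt: str, urls: List[str]) -> List[Dict[str, str]]:
    # Fetch and format context from the grounding URLs (blank entries are ignored).
    grounding_context = get_grounding_context_crawl4ai(urls)
    # Append the context to the base system prompt before the model call.
    enhanced_system_prompt = base_system_prompt + grounding_context
    return [
        {"role": "system", "content": enhanced_system_prompt},
        {"role": "user", "content": message},
    ]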
requirements.txt CHANGED

@@ -1,4 +1,5 @@
-gradio
-requests
-beautifulsoup4
-python-dotenv
+gradio>=4.44.0
+requests>=2.32.3
+beautifulsoup4>=4.12.3
+python-dotenv>=1.0.0
+crawl4ai>=0.4.245
scraping_service.py ADDED

@@ -0,0 +1,146 @@
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
from typing import List, Dict, Optional

class Crawl4AIScraper:
    """Web scraping service using Crawl4AI for better content extraction"""

    def __init__(self):
        self.crawler = None

    async def __aenter__(self):
        """Initialize the crawler when entering async context"""
        self.crawler = AsyncWebCrawler(verbose=False)
        await self.crawler.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up the crawler when exiting async context"""
        if self.crawler:
            await self.crawler.__aexit__(exc_type, exc_val, exc_tb)

    async def scrape_url(self, url: str, max_chars: int = 4000) -> str:
        """
        Scrape a single URL and extract text content

        Args:
            url: The URL to scrape
            max_chars: Maximum characters to return (default 4000)

        Returns:
            Extracted text content or error message
        """
        try:
            # Perform the crawl
            result = await self.crawler.arun(
                url=url,
                bypass_cache=True,
                word_count_threshold=10,
                excluded_tags=['script', 'style', 'nav', 'header', 'footer'],
                remove_overlay_elements=True
            )

            if result.success:
                # Get cleaned text content - prefer markdown over cleaned_html
                content = result.markdown or result.cleaned_html or ""

                # Truncate if needed
                if len(content) > max_chars:
                    content = content[:max_chars] + "..."

                return content
            else:
                return f"Error fetching {url}: Failed to retrieve content"

        except Exception as e:
            return f"Error fetching {url}: {str(e)}"

    async def scrape_multiple_urls(self, urls: List[str], max_chars_per_url: int = 4000) -> Dict[str, str]:
        """
        Scrape multiple URLs concurrently

        Args:
            urls: List of URLs to scrape
            max_chars_per_url: Maximum characters per URL

        Returns:
            Dictionary mapping URLs to their content
        """
        # Filter blank entries first so gathered results stay aligned with their URLs
        valid_urls = [url for url in urls if url and url.strip()]
        tasks = [self.scrape_url(url, max_chars_per_url) for url in valid_urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        url_content = {}
        for url, result in zip(valid_urls, results):
            if isinstance(result, Exception):
                url_content[url] = f"Error fetching {url}: {str(result)}"
            else:
                url_content[url] = result

        return url_content

def get_grounding_context_crawl4ai(urls: List[str]) -> str:
    """
    Synchronous wrapper to fetch grounding context using Crawl4AI

    Args:
        urls: List of URLs to fetch context from

    Returns:
        Formatted grounding context string
    """
    if not urls:
        return ""

    # Filter valid URLs
    valid_urls = [url for url in urls if url and url.strip()]
    if not valid_urls:
        return ""

    async def fetch_all():
        async with Crawl4AIScraper() as scraper:
            return await scraper.scrape_multiple_urls(valid_urls)

    # Run the async function - handle existing event loop
    try:
        loop = asyncio.get_running_loop()
        # We're already in an async context, create a new event loop in a thread
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(asyncio.run, fetch_all())
            url_content = future.result()
    except RuntimeError:
        # No event loop running, we can use asyncio.run directly
        url_content = asyncio.run(fetch_all())
    except Exception as e:
        return f"Error initializing scraper: {str(e)}"

    # Format the context
    context_parts = []
    for i, (url, content) in enumerate(url_content.items(), 1):
        context_parts.append(f"Context from URL {i} ({url}):\n{content}")

    if context_parts:
        return "\n\n" + "\n\n".join(context_parts) + "\n\n"
    return ""

# Backwards compatibility function
def fetch_url_content_crawl4ai(url: str) -> str:
    """
    Synchronous wrapper for single URL scraping (backwards compatibility)

    Args:
        url: The URL to fetch

    Returns:
        Extracted content or error message
    """
    async def fetch_one():
        async with Crawl4AIScraper() as scraper:
            return await scraper.scrape_url(url)

    try:
        return asyncio.run(fetch_one())
    except Exception as e:
        return f"Error fetching {url}: {str(e)}"
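A short usage sketch of the new module from synchronous code (the URLs below are placeholders; return shapes follow the functions above):

# Example usage of scraping_service.py from synchronous code.
from scraping_service import (
    fetch_url_content_crawl4ai,
    get_grounding_context_crawl4ai,
)

if __name__ == "__main__":
    # Single page: returns up to ~4000 characters of markdown/cleaned text,
    # or an "Error fetching ..." string on failure.
    print(fetch_url_content_crawl4ai("https://example.com")[:200])

    # Several grounding URLs at once; blank entries are filtered out and the
    # result is a formatted "Context from URL N (...)" block.
    context = get_grounding_context_crawl4ai([
        "https://example.com",
        "",
        "https://www.gradio.app/docs",
    ])
    print(context[:500])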