import asyncio
from typing import Dict, List, Optional

from crawl4ai import AsyncWebCrawler

class Crawl4AIScraper:
    """Web scraping service using Crawl4AI for better content extraction"""
    
    def __init__(self):
        self.crawler: Optional[AsyncWebCrawler] = None
    
    async def __aenter__(self):
        """Initialize the crawler when entering async context"""
        self.crawler = AsyncWebCrawler(verbose=False)
        await self.crawler.__aenter__()
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up the crawler when exiting async context"""
        if self.crawler:
            await self.crawler.__aexit__(exc_type, exc_val, exc_tb)
    
    async def scrape_url(self, url: str, max_chars: int = 4000) -> str:
        """
        Scrape a single URL and extract text content
        
        Args:
            url: The URL to scrape
            max_chars: Maximum characters to return (default 4000)
            
        Returns:
            Extracted text content or error message
        """
        if self.crawler is None:
            return f"Error fetching {url}: crawler not initialized (use 'async with Crawl4AIScraper()')"

        try:
            # Perform the crawl
            result = await self.crawler.arun(
                url=url,
                bypass_cache=True,
                word_count_threshold=10,
                excluded_tags=['script', 'style', 'nav', 'header', 'footer'],
                remove_overlay_elements=True
            )
            
            if result.success:
                # Get cleaned text content - prefer markdown over cleaned_html
                content = result.markdown or result.cleaned_html or ""
                
                # Truncate if needed
                if len(content) > max_chars:
                    content = content[:max_chars] + "..."
                
                return content
            else:
                return f"Error fetching {url}: Failed to retrieve content"
                
        except Exception as e:
            return f"Error fetching {url}: {str(e)}"
    
    async def scrape_multiple_urls(self, urls: List[str], max_chars_per_url: int = 4000) -> Dict[str, str]:
        """
        Scrape multiple URLs concurrently
        
        Args:
            urls: List of URLs to scrape
            max_chars_per_url: Maximum characters per URL
            
        Returns:
            Dictionary mapping URLs to their content
        """
        # Filter out empty URLs first so results stay aligned with the URLs they belong to
        valid_urls = [url for url in urls if url and url.strip()]
        tasks = [self.scrape_url(url, max_chars_per_url) for url in valid_urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        
        url_content = {}
        for url, result in zip(valid_urls, results):
            if isinstance(result, Exception):
                url_content[url] = f"Error fetching {url}: {str(result)}"
            else:
                url_content[url] = result
                
        return url_content

def get_grounding_context_crawl4ai(urls: List[str]) -> str:
    """
    Synchronous wrapper to fetch grounding context using Crawl4AI
    
    Args:
        urls: List of URLs to fetch context from
        
    Returns:
        Formatted grounding context string
    """
    if not urls:
        return ""
    
    # Filter valid URLs
    valid_urls = [url for url in urls if url and url.strip()]
    if not valid_urls:
        return ""
    
    async def fetch_all():
        async with Crawl4AIScraper() as scraper:
            return await scraper.scrape_multiple_urls(valid_urls)
    
    # Run the async function - handle the case where an event loop is already running
    try:
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop running, so asyncio.run can be used directly
            url_content = asyncio.run(fetch_all())
        else:
            # Already inside an event loop: run the coroutine on a fresh loop in a worker thread
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as executor:
                url_content = executor.submit(asyncio.run, fetch_all()).result()
    except Exception as e:
        return f"Error initializing scraper: {str(e)}"
    
    # Format the context
    context_parts = []
    for i, (url, content) in enumerate(url_content.items(), 1):
        context_parts.append(f"Context from URL {i} ({url}):\n{content}")
    
    if context_parts:
        return "\n\n" + "\n\n".join(context_parts) + "\n\n"
    return ""

# Backwards compatibility function
def fetch_url_content_crawl4ai(url: str) -> str:
    """
    Synchronous wrapper for single URL scraping (backwards compatibility)
    
    Args:
        url: The URL to fetch
        
    Returns:
        Extracted content or error message
    """
    async def fetch_one():
        async with Crawl4AIScraper() as scraper:
            return await scraper.scrape_url(url)
    
    try:
        return asyncio.run(fetch_one())
    except Exception as e:
        return f"Error fetching {url}: {str(e)}"