Spaces:

hellorahulk
/

crawlit

Sleeping

App Files Files Community

hellorahulk committed on Feb 5

Commit

0034b95

verified ·

1 Parent(s): a87d676

Upload 2 files

Browse files

Files changed (2) hide show

app.py +641 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,641 @@

+"""
+Crawl4AI Demo Application
+========================
+This application provides a web interface and API for the Crawl4AI library, allowing users to extract
+content from web pages using different crawling strategies.
+Features:
+---------
+- Web interface built with Gradio for interactive use
+- RESTful API endpoint for programmatic access
+- Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS)
+- Configurable word count threshold
+- Markdown output with metadata
+Usage:
+------
+1. Start the server:
+   ```
+   python app.py
+   ```
+2. Access the web interface at http://localhost:8000
+3. Use the API endpoint at http://localhost:8000/api/crawl
+API Example:
+-----------
+```python
+import requests
+response = requests.post(
+    "http://localhost:8000/api/crawl",
+    json={
+        "url": "https://example.com",
+        "crawler_type": "basic",
+        "word_count_threshold": 100
+    }
+)
+result = response.json()
+```
+Dependencies:
+------------
+- gradio
+- fastapi
+- pydantic
+- crawl4ai
+- playwright
+- uvicorn
+"""
+import gradio as gr
+import asyncio
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from enum import Enum
+from typing import Optional, Dict, Any, List, Set
+from contextlib import asynccontextmanager
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from playwright.async_api import async_playwright
+import urllib.parse
+class CrawlerType(str, Enum):
+    """
+    Enumeration of supported crawler types.
+    Members also subclass ``str``, so each member compares equal to its
+    lowercase string value and can be built from it, e.g. ``CrawlerType("basic")``.
+    Attributes:
+        BASIC (str): Simple HTML parsing and content extraction
+        LLM (str): Language model-based content extraction
+        COSINE (str): Cosine similarity-based content extraction
+        JSON_CSS (str): JSON/CSS selector-based content extraction
+    """
+    # NOTE(review): in the code visible in this file the selected crawler type
+    # is only echoed back in the response metadata; nothing branches on it.
+    # Confirm whether per-type behaviour is still to be implemented.
+    BASIC = "basic"
+    LLM = "llm"
+    COSINE = "cosine"
+    JSON_CSS = "json_css"
+class ExtractionType(str, Enum):
+    """
+    Enumeration of supported extraction strategies.
+    Members also subclass ``str``, so each member compares equal to its
+    lowercase string value and can be built from it, e.g. ``ExtractionType("css")``.
+    Attributes:
+        DEFAULT (str): Default extraction without specific strategy
+        CSS (str): CSS selector-based extraction
+        XPATH (str): XPath-based extraction
+        LLM (str): Language model-based extraction
+        COMBINED (str): Combined strategy using multiple approaches
+    """
+    # NOTE(review): create_extraction_strategy() in this file only builds a
+    # strategy for CSS (and only when a selector is supplied); every other
+    # member currently results in no extraction strategy.
+    DEFAULT = "default"
+    CSS = "css"
+    XPATH = "xpath"
+    LLM = "llm"
+    COMBINED = "combined"
+class CrawlRequest(BaseModel):
+    """
+    Request model for crawling operations.
+    Used both as the JSON body of ``POST /api/crawl`` and as the internal
+    request object built by the Gradio handler. Only ``url`` is required.
+    Attributes:
+        url (str): The URL to crawl
+        crawler_type (CrawlerType): The type of crawler to use (default: BASIC)
+        extraction_type (ExtractionType): The extraction strategy to use (default: DEFAULT)
+        word_count_threshold (int): Minimum word count for extracted content (default: 100)
+        css_selector (Optional[str]): CSS selector for content extraction; when set it is
+            also used to build the ``css:<selector>`` wait condition
+        xpath_query (Optional[str]): XPath query for content extraction
+        excluded_tags (Optional[list]): HTML tags to exclude from extraction; when empty or
+            None the crawl code falls back to ``["nav", "footer", "header"]``
+        scan_full_page (bool): Whether to scan the entire page for lazy-loaded content
+        scroll_delay (float): Delay between scroll steps in seconds (default: 0.5)
+        crawl_subpages (bool): Whether to crawl sub-pages found in links
+        max_depth (int): Maximum depth for recursive crawling (1 = only direct links)
+        exclude_external_links (bool): Whether to exclude links to external domains
+        max_pages (int): Maximum number of pages to crawl (default: 10)
+    """
+    # NOTE(review): xpath_query is accepted and echoed in metadata, but
+    # create_extraction_strategy() in this file never reads it.
+    # NOTE(review): crawl_with_subpages() starts the root page at depth 1, so
+    # max_depth=1 appears to stop before any linked page - confirm against the
+    # "1 = only direct links" wording above.
+    url: str
+    crawler_type: CrawlerType = CrawlerType.BASIC
+    extraction_type: ExtractionType = ExtractionType.DEFAULT
+    word_count_threshold: int = 100
+    css_selector: Optional[str] = None
+    xpath_query: Optional[str] = None
+    excluded_tags: Optional[list] = None
+    scan_full_page: bool = False
+    scroll_delay: float = 0.5
+    crawl_subpages: bool = False
+    max_depth: int = 1
+    exclude_external_links: bool = True
+    max_pages: int = 10
+# Module-level crawler handle. It starts as None, is assigned inside lifespan(),
+# and is referenced by startup_event()/shutdown_event(). The request handlers
+# visible in this file create their own AsyncWebCrawler instances instead.
+crawler = None
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Application lifespan hook handed to the FastAPI constructor.
+    On entry it builds the module-level ``crawler`` from a headless
+    1920x1080 browser configuration; on exit it closes that crawler if the
+    global holds one.
+    Args:
+        app (FastAPI): The application instance (not used in the body).
+    Yields:
+        None: Control goes back to FastAPI while the application is serving.
+    """
+    global crawler
+    # Headless browser with a full-HD viewport
+    headless_config = BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080)
+    try:
+        # Only construction happens here; the instance is not started explicitly
+        crawler = AsyncWebCrawler(config=headless_config)
+        print("Crawler initialized successfully")
+        yield
+    finally:
+        # Reached on shutdown, and also when construction above raised
+        if crawler:
+            await crawler.close()
+            print("Crawler resources cleaned up")
+# Create the FastAPI app. The lifespan handler defined above is registered here,
+# so crawler construction and cleanup are tied to application startup/shutdown.
+app = FastAPI(
+    title="Crawl4AI Demo",
+    description="A web interface and API for extracting content from web pages using Crawl4AI",
+    version="1.0.0",
+    lifespan=lifespan
+)
+@app.on_event("startup")
+async def startup_event():
+    """
+    Initialize the browser on startup.
+    Opens a Playwright context and hands it to the global crawler's
+    ``initialize`` method; any failure is printed and re-raised.
+    """
+    # NOTE(review): the app above is created with a ``lifespan`` handler, and
+    # FastAPI's documentation states on_event handlers are not called in that
+    # case - confirm this function ever runs.
+    try:
+        # NOTE(review): the ``async with`` block exits as soon as initialize()
+        # returns, which presumably shuts Playwright down again - confirm intent.
+        async with async_playwright() as playwright:
+            # NOTE(review): verify that AsyncWebCrawler exposes initialize().
+            await crawler.initialize(playwright)
+    except Exception as e:
+        print(f"Error initializing browser: {e}")
+        raise
+@app.on_event("shutdown")
+async def shutdown_event():
+    """
+    Clean up browser resources on shutdown.
+    Calls ``cleanup`` on the global crawler; errors are printed and swallowed
+    so that shutdown can continue.
+    """
+    # NOTE(review): the app is created with a ``lifespan`` handler (which already
+    # closes the crawler), and FastAPI's documentation states on_event handlers
+    # are not called in that case - confirm this function ever runs.
+    try:
+        # NOTE(review): verify that AsyncWebCrawler exposes cleanup(); lifespan()
+        # uses close() for the same purpose.
+        await crawler.cleanup()
+    except Exception as e:
+        print(f"Error during cleanup: {e}")
+def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Optional[str] = None, xpath_query: Optional[str] = None) -> Any:
+    """
+    Build the extraction strategy matching the requested extraction type.
+    Only CSS extraction with a non-empty selector produces a strategy; every
+    other combination yields None.
+    Args:
+        extraction_type (ExtractionType): The requested extraction strategy type
+        css_selector (Optional[str]): Base CSS selector under which fields are extracted
+        xpath_query (Optional[str]): XPath query (accepted but not used here)
+    Returns:
+        Any: A JsonCssExtractionStrategy for CSS extraction, otherwise None
+    """
+    wants_css = extraction_type == ExtractionType.CSS
+    if not (wants_css and css_selector):
+        return None
+    # Fields pulled from each element matched by the base selector
+    field_specs = [
+        {"name": "title", "selector": "h1,h2", "type": "text"},
+        {"name": "text", "selector": "p", "type": "text"},
+        {"name": "links", "selector": "a", "type": "attribute", "attribute": "href"},
+    ]
+    css_schema = {"name": "Content", "baseSelector": css_selector, "fields": field_specs}
+    return JsonCssExtractionStrategy(css_schema)
+def _normalize_url(url: str) -> str:
+    """Return the de-duplication key for a URL: fragment removed, trailing slash stripped."""
+    without_fragment, _fragment = urllib.parse.urldefrag(url)
+    return without_fragment.rstrip("/")
+def _link_href(link: Any) -> Optional[str]:
+    """Return the URL string of a crawled link entry (plain string or dict with 'href'), else None."""
+    if isinstance(link, dict):
+        link = link.get("href")
+    return link if isinstance(link, str) and link else None
+async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Optional[Set[str]] = None) -> Optional[Dict]:
+    """
+    Recursively crawl a page and, if enabled, its internal links.
+    Args:
+        request (CrawlRequest): Crawl settings; ``request.url`` is the page fetched by this call
+        base_url (str): Network location (host[:port]) of the start page, used to detect external links
+        current_depth (int): Depth of this page; the start page is depth 1, so linked
+            pages are only followed when ``request.max_depth`` is at least 2
+        visited (Optional[Set[str]]): Normalized URLs already crawled; shared across the recursion
+    Returns:
+        Optional[Dict]: ``{"pages": [...], "total_links": int, "visited_pages": int}`` on
+        success, or None when the page is skipped (depth/page limit, already visited) or fails
+    """
+    if visited is None:
+        visited = set()
+    if current_depth > request.max_depth or len(visited) >= request.max_pages:
+        return None
+    # Key on the full URL (minus fragment). Joining with '/' would collapse every
+    # page of a site onto its root and mark all sub-pages as already visited.
+    page_key = _normalize_url(request.url)
+    if page_key in visited:
+        return None
+    # Create run configuration for current page
+    run_config = CrawlerRunConfig(
+        # Core settings
+        cache_mode=CacheMode.BYPASS,
+        verbose=True,  # Enable verbose logging
+        # Content settings
+        word_count_threshold=request.word_count_threshold,
+        css_selector=request.css_selector,
+        excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
+        exclude_external_links=request.exclude_external_links,
+        # Page & JS settings
+        wait_for=f"css:{request.css_selector}" if request.css_selector else None,
+        wait_for_images=True,
+        page_timeout=30000,
+        # Lazy loading settings
+        scan_full_page=request.scan_full_page,
+        scroll_delay=request.scroll_delay,
+        # Extraction settings
+        extraction_strategy=create_extraction_strategy(
+            request.extraction_type,
+            request.css_selector,
+            request.xpath_query
+        )
+    )
+    browser_config = BrowserConfig(
+        headless=True,
+        viewport_width=1920,
+        viewport_height=1080
+    )
+    results = {
+        "pages": [],
+        "total_links": 0,
+        "visited_pages": len(visited)
+    }
+    try:
+        # Close this page's browser before descending so that a deep crawl does
+        # not keep one browser open per recursion level.
+        async with AsyncWebCrawler(config=browser_config) as page_crawler:
+            result = await page_crawler.arun(url=request.url, config=run_config)
+        if not result.success:
+            print(f"Failed to crawl {request.url}: {result.error_message}")
+            return None
+        # Add current page result
+        results["pages"].append({
+            "url": request.url,
+            "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
+            "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
+            "depth": current_depth
+        })
+        visited.add(page_key)
+        # Count this page even when no sub-page is crawled afterwards
+        results["visited_pages"] = len(visited)
+        if not request.crawl_subpages:
+            return results
+        internal_links = (getattr(result, 'links', None) or {}).get("internal") or []
+        results["total_links"] += len(internal_links)
+        for link in internal_links:
+            if len(visited) >= request.max_pages:
+                break
+            try:
+                href = _link_href(link)
+                if href is None:
+                    continue
+                # Resolve relative links against the current page and drop fragments
+                target, _fragment = urllib.parse.urldefrag(urllib.parse.urljoin(request.url, href))
+                parsed_target = urllib.parse.urlparse(target)
+                # Skip mailto:, javascript:, tel: and similar non-web links
+                if parsed_target.scheme not in ("http", "https"):
+                    continue
+                if _normalize_url(target) in visited:
+                    continue
+                if request.exclude_external_links and parsed_target.netloc != base_url:
+                    continue
+                # Same settings as the parent request, pointed at the sub-page
+                sub_request = CrawlRequest(**{**request.dict(), "url": target})
+                sub_result = await crawl_with_subpages(
+                    sub_request,
+                    base_url,
+                    current_depth + 1,
+                    visited
+                )
+                if sub_result:
+                    results["pages"].extend(sub_result["pages"])
+                    results["total_links"] += sub_result["total_links"]
+                    results["visited_pages"] = len(visited)
+            except Exception as e:
+                # Best effort: one bad link must not abort the whole crawl
+                print(f"Error processing link {link}: {str(e)}")
+                continue
+        return results
+    except Exception as e:
+        print(f"Error crawling {request.url}: {str(e)}")
+        return None
+def _friendly_crawl_error(error_msg: Any, css_selector: Optional[str]) -> str:
+    """Translate well-known crawler error texts into a user-facing explanation."""
+    error_msg = str(error_msg or "Crawl failed")
+    if "Wait condition failed" in error_msg:
+        return f"Failed to find element matching selector '{css_selector}'. Please check if the selector is correct."
+    if "TimeoutError" in error_msg:
+        return "Page took too long to load. Please try again or check the URL."
+    return error_msg
+def _format_image_info(images: List[Dict[str, Any]], limit: int = 5) -> str:
+    """Render a short markdown list describing the first *limit* images ('' when there are none)."""
+    if not images:
+        return ""
+    parts = ["\n### Images Found\n"]
+    for i, img in enumerate(images[:limit]):
+        parts.append(f"- Image {i+1}: {img.get('src', 'N/A')}\n")
+        if img.get('alt'):
+            parts.append(f"  Alt: {img['alt']}\n")
+        if img.get('score'):
+            parts.append(f"  Score: {img['score']}\n")
+    return "".join(parts)
+@app.post("/api/crawl")
+async def crawl_url(request: CrawlRequest):
+    """
+    API endpoint to crawl a URL and return the extracted content.
+    Args:
+        request (CrawlRequest): Crawl settings; ``crawl_subpages`` selects between a
+            recursive crawl and a single-page crawl
+    Returns:
+        dict: Always contains ``markdown`` and ``metadata``. A sub-page crawl adds
+        ``pages``; a single-page crawl adds ``extracted_content`` and ``image_info``.
+    Raises:
+        HTTPException: Status 500 with a descriptive ``detail`` when crawling fails
+    """
+    try:
+        base_url = urllib.parse.urlparse(request.url).netloc
+        if request.crawl_subpages:
+            results = await crawl_with_subpages(request, base_url)
+            if not results or not results["pages"]:
+                raise HTTPException(status_code=500, detail=f"Failed to crawl pages starting from {request.url}")
+            # Combine results from all pages, separated by a markdown rule
+            # (real newlines, not the literal two-character sequence "\n")
+            combined_markdown = "\n\n---\n\n".join(
+                f"## Page: {page['url']}\n{page['markdown']}"
+                for page in results["pages"]
+            )
+            return {
+                "markdown": combined_markdown,
+                "metadata": {
+                    "url": request.url,
+                    "crawler_type": request.crawler_type.value,
+                    "extraction_type": request.extraction_type.value,
+                    "word_count_threshold": request.word_count_threshold,
+                    "css_selector": request.css_selector,
+                    "xpath_query": request.xpath_query,
+                    "scan_full_page": request.scan_full_page,
+                    "scroll_delay": request.scroll_delay,
+                    "total_pages_crawled": results["visited_pages"],
+                    "total_links_found": results["total_links"],
+                    "max_depth_reached": min(request.max_depth, max(page["depth"] for page in results["pages"]))
+                },
+                "pages": results["pages"]
+            }
+        # Single-page crawl
+        # Format wait_for condition properly if CSS selector is provided
+        wait_condition = f"css:{request.css_selector}" if request.css_selector else None
+        run_config = CrawlerRunConfig(
+            # Core settings
+            cache_mode=CacheMode.BYPASS,
+            # Content settings
+            word_count_threshold=request.word_count_threshold,
+            css_selector=request.css_selector,
+            excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
+            # Page & JS settings
+            wait_for=wait_condition,  # Using properly formatted wait condition
+            wait_for_images=True,     # Always wait for images to load
+            page_timeout=30000,       # 30 seconds timeout for page operations
+            # Lazy loading settings
+            scan_full_page=request.scan_full_page,
+            scroll_delay=request.scroll_delay,
+            # Extraction settings
+            extraction_strategy=create_extraction_strategy(
+                request.extraction_type,
+                request.css_selector,
+                request.xpath_query
+            )
+        )
+        browser_config = BrowserConfig(
+            headless=True,
+            viewport_width=1920,
+            viewport_height=1080
+        )
+        async with AsyncWebCrawler(config=browser_config) as temp_crawler:
+            result = await temp_crawler.arun(
+                url=request.url,
+                config=run_config
+            )
+        if not result.success:
+            raise HTTPException(
+                status_code=500,
+                detail=_friendly_crawl_error(result.error_message, request.css_selector)
+            )
+        images = (getattr(result, 'media', None) or {}).get("images") or []
+        return {
+            "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
+            "metadata": {
+                "url": request.url,
+                "crawler_type": request.crawler_type.value,
+                "extraction_type": request.extraction_type.value,
+                "word_count_threshold": request.word_count_threshold,
+                "css_selector": request.css_selector,
+                "xpath_query": request.xpath_query,
+                "scan_full_page": request.scan_full_page,
+                "scroll_delay": request.scroll_delay,
+                "wait_condition": wait_condition
+            },
+            "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
+            "image_info": _format_image_info(images)  # first 5 images only
+        }
+    except HTTPException:
+        # Already carries the intended status and detail; re-wrapping it would
+        # stringify it and prefix the detail with the status code.
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=_friendly_crawl_error(str(e), request.css_selector)
+        ) from e
+async def gradio_crawl(
+    url: str,
+    crawler_type: str,
+    extraction_type: str,
+    word_count_threshold: int,
+    css_selector: str,
+    xpath_query: str,
+    scan_full_page: bool,
+    scroll_delay: float,
+    crawl_subpages: bool,
+    max_depth: int,
+    max_pages: int,
+    exclude_external_links: bool
+) -> tuple[str, str]:
+    """
+    Gradio interface function to handle crawling requests from the web UI.
+    Never raises: invalid input and crawl failures are reported through the
+    returned strings so the UI can display them.
+    Args:
+        url (str): The webpage URL to crawl
+        crawler_type (str): Crawler type label from the UI (e.g. "Basic", "JSON/CSS")
+        extraction_type (str): Extraction strategy label from the UI (e.g. "Default", "CSS")
+        word_count_threshold (int): Minimum word count threshold
+        css_selector (str): CSS selector for content targeting (empty string = none)
+        xpath_query (str): XPath query for content targeting (empty string = none)
+        scan_full_page (bool): Whether to scan the full page
+        scroll_delay (float): Delay between scroll steps
+        crawl_subpages (bool): Whether to crawl sub-pages
+        max_depth (int): Maximum crawl depth
+        max_pages (int): Maximum number of pages to crawl
+        exclude_external_links (bool): Whether to exclude external links
+    Returns:
+        tuple[str, str]: Tuple containing (markdown_content, metadata_string)
+    """
+    try:
+        url = (url or "").strip()
+        if not url:
+            return "Error: Please enter a URL to crawl", "Error occurred while crawling"
+        # Building the request inside the try block turns an unknown label or an
+        # invalid value into a UI error message instead of an unhandled exception.
+        request = CrawlRequest(
+            url=url,
+            # UI label "JSON/CSS" corresponds to the enum value "json_css"
+            crawler_type=CrawlerType(crawler_type.lower().replace("/", "_")),
+            extraction_type=ExtractionType(extraction_type.lower()),
+            # Sliders may deliver floats; the model expects whole numbers
+            word_count_threshold=int(word_count_threshold),
+            css_selector=css_selector if css_selector else None,
+            xpath_query=xpath_query if xpath_query else None,
+            scan_full_page=scan_full_page,
+            scroll_delay=scroll_delay,
+            crawl_subpages=crawl_subpages,
+            max_depth=int(max_depth),
+            max_pages=int(max_pages),
+            exclude_external_links=exclude_external_links
+        )
+        result = await crawl_url(request)
+        # Convert markdown result to string if it exists
+        markdown_content = str(result["markdown"]) if result.get("markdown") else ""
+        meta = result["metadata"]
+        # Format the metadata and results
+        metadata_str = f"""### Metadata
+- URL: {meta['url']}
+- Crawler Type: {meta['crawler_type']}
+- Extraction Type: {meta['extraction_type']}
+- Word Count Threshold: {meta['word_count_threshold']}
+- CSS Selector: {meta['css_selector'] or 'None'}
+- XPath Query: {meta['xpath_query'] or 'None'}
+- Full Page Scan: {meta['scan_full_page']}
+- Scroll Delay: {meta['scroll_delay']}s"""
+        # Add sub-page crawling information if enabled
+        if crawl_subpages:
+            metadata_str += f"""
+- Total Pages Crawled: {meta.get('total_pages_crawled', 0)}
+- Total Links Found: {meta.get('total_links_found', 0)}
+- Max Depth Reached: {meta.get('max_depth_reached', 1)}"""
+        # Add image information if available
+        if result.get('image_info'):
+            metadata_str += f"\n\n{result['image_info']}"
+        # Add extracted content if available
+        if result.get("extracted_content"):
+            metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"
+        return markdown_content, metadata_str
+    except Exception as e:
+        # HTTPException keeps its message in ``detail``; fall back to str(e) otherwise
+        detail = getattr(e, "detail", None) or str(e)
+        error_msg = f"Error: {detail}"
+        return error_msg, "Error occurred while crawling"
+# Create Gradio interface with enhanced documentation.
+# The input components are passed to gradio_crawl positionally, so their order
+# here must match that function's parameter order.
+demo = gr.Interface(
+    fn=gradio_crawl,
+    inputs=[
+        # url
+        gr.Textbox(
+            label="URL",
+            placeholder="Enter URL to crawl",
+            info="The webpage URL to extract content from"
+        ),
+        # crawler_type
+        # NOTE(review): the "JSON/CSS" label does not lowercase to the enum
+        # value "json_css" - confirm gradio_crawl maps it before building the enum.
+        gr.Dropdown(
+            choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
+            label="Crawler Type",
+            value="Basic",
+            info="Select the content extraction strategy"
+        ),
+        # extraction_type
+        gr.Dropdown(
+            choices=["Default", "CSS", "XPath", "LLM", "Combined"],
+            label="Extraction Type",
+            value="Default",
+            info="Choose how to extract content from the page"
+        ),
+        # word_count_threshold
+        gr.Slider(
+            minimum=50,
+            maximum=500,
+            value=100,
+            step=50,
+            label="Word Count Threshold",
+            info="Minimum number of words required for content extraction"
+        ),
+        # css_selector
+        gr.Textbox(
+            label="CSS Selector",
+            placeholder="e.g., article.content, main.post",
+            info="CSS selector to target specific content (used with CSS extraction type)"
+        ),
+        # xpath_query
+        gr.Textbox(
+            label="XPath Query",
+            placeholder="e.g., //article[@class='content']",
+            info="XPath query to target specific content (used with XPath extraction type)"
+        ),
+        # scan_full_page
+        gr.Checkbox(
+            label="Scan Full Page",
+            value=False,
+            info="Enable to scroll through the entire page to load lazy content"
+        ),
+        # scroll_delay
+        gr.Slider(
+            minimum=0.1,
+            maximum=2.0,
+            value=0.5,
+            step=0.1,
+            label="Scroll Delay",
+            info="Delay between scroll steps in seconds when scanning full page"
+        ),
+        # crawl_subpages
+        gr.Checkbox(
+            label="Crawl Sub-pages",
+            value=False,
+            info="Enable to crawl links found on the page"
+        ),
+        # max_depth
+        gr.Slider(
+            minimum=1,
+            maximum=5,
+            value=1,
+            step=1,
+            label="Max Crawl Depth",
+            info="Maximum depth for recursive crawling (1 = only direct links)"
+        ),
+        # max_pages
+        gr.Slider(
+            minimum=1,
+            maximum=50,
+            value=10,
+            step=5,
+            label="Max Pages",
+            info="Maximum number of pages to crawl"
+        ),
+        # exclude_external_links
+        gr.Checkbox(
+            label="Exclude External Links",
+            value=True,
+            info="Only crawl links within the same domain"
+        )
+    ],
+    # The two outputs correspond to gradio_crawl's (markdown_content, metadata_string) tuple
+    outputs=[
+        gr.Markdown(label="Generated Markdown"),
+        gr.Markdown(label="Metadata & Extraction Results")
+    ],
+    title="Crawl4AI Demo",
+    description="""
+    This demo allows you to extract content from web pages using different crawling and extraction strategies.
+    1. Enter a URL to crawl
+    2. Select a crawler type (Basic, LLM, Cosine, JSON/CSS)
+    3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
+    4. Configure additional options:
+       - Word count threshold for content filtering
+       - CSS selectors for targeting specific content
+       - XPath queries for precise extraction
+       - Full page scanning for lazy-loaded content
+       - Scroll delay for controlling page scanning speed
+       - Sub-page crawling with depth control
+       - Maximum number of pages to crawl
+       - External link filtering
+    The extracted content will be displayed in markdown format along with metadata and extraction results.
+    When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
+    """
+)
+# Mount the Gradio UI at the root path; the returned app is rebound to `app`,
+# so the /api/crawl route registered above is served alongside the UI.
+app = gr.mount_gradio_app(app, demo, path="/")
+if __name__ == "__main__":
+    # Run directly with uvicorn, listening on all interfaces on port 8000
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+crawl4ai>=0.4.3b0
+fastapi>=0.104.1
+uvicorn>=0.24.0
+gradio==4.0.0
+python-dotenv>=1.0.0
+pydantic>=2.5.0
+aiofiles==23.2.1