""" Crawl4AI Demo Application (Docker Version) ======================================= This is a modified version of the Crawl4AI demo application specifically designed for deployment in a Docker container on Hugging Face Spaces. Features: --------- - Web interface built with Gradio for interactive use - Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS) - Configurable word count threshold - Markdown output with metadata - Sub-page crawling capabilities - Lazy loading support - Docker-optimized configuration """ import gradio as gr import asyncio from typing import Optional, Dict, Any, List, Set from enum import Enum from pydantic import BaseModel import os # Set crawl4ai database path before importing crawl4ai os.environ["CRAWL4AI_DB_PATH"] = "/home/crawler/.crawl4ai" from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig from crawl4ai.extraction_strategy import JsonCssExtractionStrategy import urllib.parse # Configure browser settings for Docker environment CHROME_PATH = "/usr/bin/google-chrome-stable" os.environ["CHROME_PATH"] = CHROME_PATH class CrawlerType(str, Enum): """Enumeration of supported crawler types.""" BASIC = "basic" LLM = "llm" COSINE = "cosine" JSON_CSS = "json_css" class ExtractionType(str, Enum): """Enumeration of supported extraction strategies.""" DEFAULT = "default" CSS = "css" XPATH = "xpath" LLM = "llm" COMBINED = "combined" class CrawlRequest(BaseModel): """Request model for crawling operations.""" url: str crawler_type: CrawlerType = CrawlerType.BASIC extraction_type: ExtractionType = ExtractionType.DEFAULT word_count_threshold: int = 100 css_selector: Optional[str] = None xpath_query: Optional[str] = None excluded_tags: Optional[list] = None scan_full_page: bool = False scroll_delay: float = 0.5 crawl_subpages: bool = False max_depth: int = 1 exclude_external_links: bool = True max_pages: int = 10 def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Optional[str] = None, xpath_query: Optional[str] = None) -> Any: """Create an extraction strategy based on the specified type.""" if extraction_type == ExtractionType.CSS and css_selector: schema = { "name": "Content", "baseSelector": css_selector, "fields": [ {"name": "title", "selector": "h1,h2", "type": "text"}, {"name": "text", "selector": "p", "type": "text"}, {"name": "links", "selector": "a", "type": "attribute", "attribute": "href"} ] } return JsonCssExtractionStrategy(schema) return None async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict: """Recursively crawl pages including sub-pages up to the specified depth.""" if visited is None: visited = set() if current_depth > request.max_depth or len(visited) >= request.max_pages: return None normalized_url = urllib.parse.urljoin(request.url, '/') if normalized_url in visited: return None run_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, verbose=True, word_count_threshold=request.word_count_threshold, css_selector=request.css_selector, excluded_tags=request.excluded_tags or ["nav", "footer", "header"], exclude_external_links=request.exclude_external_links, wait_for=f"css:{request.css_selector}" if request.css_selector else None, wait_for_images=True, page_timeout=30000, scan_full_page=request.scan_full_page, scroll_delay=request.scroll_delay, extraction_strategy=create_extraction_strategy( request.extraction_type, request.css_selector, request.xpath_query ) ) # Docker-optimized browser configuration browser_config = BrowserConfig( headless=True, viewport_width=1920, viewport_height=1080, chrome_path=CHROME_PATH, args=[ "--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu" ] ) results = { "pages": [], "total_links": 0, "visited_pages": len(visited) } try: async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url=request.url, config=run_config) if not result.success: print(f"Failed to crawl {request.url}: {result.error_message}") return None page_result = { "url": request.url, "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "", "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None, "depth": current_depth } results["pages"].append(page_result) visited.add(normalized_url) if request.crawl_subpages and hasattr(result, 'links'): internal_links = result.links.get("internal", []) if internal_links: results["total_links"] += len(internal_links) for link in internal_links: if len(visited) >= request.max_pages: break try: normalized_link = urllib.parse.urljoin(request.url, link) link_domain = urllib.parse.urlparse(normalized_link).netloc if normalized_link in visited or (request.exclude_external_links and link_domain != base_url): continue sub_request = CrawlRequest( **{**request.dict(), "url": normalized_link} ) sub_result = await crawl_with_subpages( sub_request, base_url, current_depth + 1, visited ) if sub_result: results["pages"].extend(sub_result["pages"]) results["total_links"] += sub_result["total_links"] results["visited_pages"] = len(visited) except Exception as e: print(f"Error processing link {link}: {str(e)}") continue return results except Exception as e: print(f"Error crawling {request.url}: {str(e)}") return None async def crawl_url(request: CrawlRequest) -> Dict: """Crawl a URL and return the extracted content.""" try: base_url = urllib.parse.urlparse(request.url).netloc if request.crawl_subpages: results = await crawl_with_subpages(request, base_url) if not results or not results["pages"]: raise Exception(f"Failed to crawl pages starting from {request.url}") combined_markdown = "\\n\\n---\\n\\n".join( f"## Page: {page['url']}\\n{page['markdown']}" for page in results["pages"] ) return { "markdown": combined_markdown, "metadata": { "url": request.url, "crawler_type": request.crawler_type.value, "extraction_type": request.extraction_type.value, "word_count_threshold": request.word_count_threshold, "css_selector": request.css_selector, "xpath_query": request.xpath_query, "scan_full_page": request.scan_full_page, "scroll_delay": request.scroll_delay, "total_pages_crawled": results["visited_pages"], "total_links_found": results["total_links"], "max_depth_reached": min(request.max_depth, max(page["depth"] for page in results["pages"])) }, "pages": results["pages"] } else: wait_condition = f"css:{request.css_selector}" if request.css_selector else None run_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, word_count_threshold=request.word_count_threshold, css_selector=request.css_selector, excluded_tags=request.excluded_tags or ["nav", "footer", "header"], wait_for=wait_condition, wait_for_images=True, page_timeout=30000, scan_full_page=request.scan_full_page, scroll_delay=request.scroll_delay, extraction_strategy=create_extraction_strategy( request.extraction_type, request.css_selector, request.xpath_query ) ) # Docker-optimized browser configuration browser_config = BrowserConfig( headless=True, viewport_width=1920, viewport_height=1080, chrome_path=CHROME_PATH, args=[ "--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu" ] ) async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url=request.url, config=run_config) if not result.success: raise Exception(result.error_message) images = result.media.get("images", []) if hasattr(result, 'media') else [] image_info = "\n### Images Found\n" if images else "" for i, img in enumerate(images[:5]): image_info += f"- Image {i+1}: {img.get('src', 'N/A')}\n" if img.get('alt'): image_info += f" Alt: {img['alt']}\n" if img.get('score'): image_info += f" Score: {img['score']}\n" return { "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "", "metadata": { "url": request.url, "crawler_type": request.crawler_type.value, "extraction_type": request.extraction_type.value, "word_count_threshold": request.word_count_threshold, "css_selector": request.css_selector, "xpath_query": request.xpath_query, "scan_full_page": request.scan_full_page, "scroll_delay": request.scroll_delay, "wait_condition": wait_condition }, "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None, "image_info": image_info } except Exception as e: raise Exception(str(e)) async def gradio_crawl( url: str, crawler_type: str, extraction_type: str, word_count_threshold: int, css_selector: str, xpath_query: str, scan_full_page: bool, scroll_delay: float, crawl_subpages: bool, max_depth: int, max_pages: int, exclude_external_links: bool ) -> tuple[str, str]: """Handle crawling requests from the Gradio interface.""" try: request = CrawlRequest( url=url, crawler_type=CrawlerType(crawler_type.lower()), extraction_type=ExtractionType(extraction_type.lower()), word_count_threshold=word_count_threshold, css_selector=css_selector if css_selector else None, xpath_query=xpath_query if xpath_query else None, scan_full_page=scan_full_page, scroll_delay=scroll_delay, crawl_subpages=crawl_subpages, max_depth=max_depth, max_pages=max_pages, exclude_external_links=exclude_external_links ) result = await crawl_url(request) markdown_content = str(result["markdown"]) if result.get("markdown") else "" metadata_str = f"""### Metadata - URL: {result['metadata']['url']} - Crawler Type: {result['metadata']['crawler_type']} - Extraction Type: {result['metadata']['extraction_type']} - Word Count Threshold: {result['metadata']['word_count_threshold']} - CSS Selector: {result['metadata']['css_selector'] or 'None'} - XPath Query: {result['metadata']['xpath_query'] or 'None'} - Full Page Scan: {result['metadata']['scan_full_page']} - Scroll Delay: {result['metadata']['scroll_delay']}s""" if crawl_subpages: metadata_str += f""" - Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)} - Total Links Found: {result['metadata'].get('total_links_found', 0)} - Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}""" if result.get('image_info'): metadata_str += f"\n\n{result['image_info']}" if result.get("extracted_content"): metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```" return markdown_content, metadata_str except Exception as e: error_msg = f"Error: {str(e)}" return error_msg, "Error occurred while crawling" # Create Gradio interface with simplified configuration with gr.Blocks(title="Crawl4AI Demo") as demo: gr.Markdown(""" # Crawl4AI Web Content Extractor Extract content from web pages using different crawling and extraction strategies. """) with gr.Row(): with gr.Column(): url_input = gr.Textbox( label="URL", placeholder="Enter URL to crawl", info="The webpage URL to extract content from" ) crawler_type = gr.Dropdown( choices=["Basic", "LLM", "Cosine", "JSON/CSS"], label="Crawler Type", value="Basic", info="Select the content extraction strategy" ) extraction_type = gr.Dropdown( choices=["Default", "CSS", "XPath", "LLM", "Combined"], label="Extraction Type", value="Default", info="Choose how to extract content from the page" ) word_count = gr.Slider( minimum=50, maximum=500, value=100, step=50, label="Word Count Threshold", info="Minimum number of words required for content extraction" ) css_selector = gr.Textbox( label="CSS Selector", placeholder="e.g., article.content, main.post", info="CSS selector to target specific content" ) xpath_query = gr.Textbox( label="XPath Query", placeholder="e.g., //article[@class='content']", info="XPath query to target specific content" ) with gr.Column(): scan_full_page = gr.Checkbox( label="Scan Full Page", value=False, info="Enable to scroll through the entire page" ) scroll_delay = gr.Slider( minimum=0.1, maximum=2.0, value=0.5, step=0.1, label="Scroll Delay", info="Delay between scroll steps in seconds" ) crawl_subpages = gr.Checkbox( label="Crawl Sub-pages", value=False, info="Enable to crawl links found on the page" ) max_depth = gr.Slider( minimum=1, maximum=5, value=1, step=1, label="Max Crawl Depth", info="Maximum depth for recursive crawling" ) max_pages = gr.Slider( minimum=1, maximum=50, value=10, step=5, label="Max Pages", info="Maximum number of pages to crawl" ) exclude_external = gr.Checkbox( label="Exclude External Links", value=True, info="Only crawl links within the same domain" ) with gr.Row(): crawl_button = gr.Button("Start Crawling") with gr.Row(): output_markdown = gr.Markdown(label="Generated Markdown") output_metadata = gr.Markdown(label="Metadata & Results") crawl_button.click( fn=gradio_crawl, inputs=[ url_input, crawler_type, extraction_type, word_count, css_selector, xpath_query, scan_full_page, scroll_delay, crawl_subpages, max_depth, max_pages, exclude_external ], outputs=[output_markdown, output_metadata] ) gr.Examples( examples=[ ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True], ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True], ], inputs=[ url_input, crawler_type, extraction_type, word_count, css_selector, xpath_query, scan_full_page, scroll_delay, crawl_subpages, max_depth, max_pages, exclude_external ] ) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)