"""
Crawl4AI Demo Application (Docker Version)
==========================================
This is a modified version of the Crawl4AI demo application specifically designed
for deployment in a Docker container on Hugging Face Spaces.
Features:
---------
- Web interface built with Gradio for interactive use
- Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS)
- Configurable word count threshold
- Markdown output with metadata
- Sub-page crawling capabilities
- Lazy loading support
- Docker-optimized configuration
"""
import gradio as gr
import asyncio
from typing import Optional, Dict, Any, List, Set
from enum import Enum
from pydantic import BaseModel
import os
# Set crawl4ai database path before importing crawl4ai
os.environ["CRAWL4AI_DB_PATH"] = "/home/crawler/.crawl4ai"
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import urllib.parse
# Configure browser settings for Docker environment
CHROME_PATH = "/usr/bin/google-chrome-stable"
os.environ["CHROME_PATH"] = CHROME_PATH
class CrawlerType(str, Enum):
"""Enumeration of supported crawler types."""
BASIC = "basic"
LLM = "llm"
COSINE = "cosine"
JSON_CSS = "json_css"
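# NOTE: in this demo the selected crawler type is only recorded in the result
# metadata; it does not change how the page is actually crawled.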
class ExtractionType(str, Enum):
"""Enumeration of supported extraction strategies."""
DEFAULT = "default"
CSS = "css"
XPATH = "xpath"
LLM = "llm"
COMBINED = "combined"
class CrawlRequest(BaseModel):
"""Request model for crawling operations."""
url: str
crawler_type: CrawlerType = CrawlerType.BASIC
extraction_type: ExtractionType = ExtractionType.DEFAULT
word_count_threshold: int = 100
css_selector: Optional[str] = None
xpath_query: Optional[str] = None
    excluded_tags: Optional[List[str]] = None
scan_full_page: bool = False
scroll_delay: float = 0.5
crawl_subpages: bool = False
max_depth: int = 1
exclude_external_links: bool = True
max_pages: int = 10
def create_extraction_strategy(
    extraction_type: ExtractionType,
    css_selector: Optional[str] = None,
    xpath_query: Optional[str] = None,
) -> Any:
    """Create an extraction strategy for the requested type.

    Only the CSS strategy is currently implemented; all other types return None,
    which leaves the crawler's default markdown extraction in place.
    """
if extraction_type == ExtractionType.CSS and css_selector:
schema = {
"name": "Content",
"baseSelector": css_selector,
"fields": [
{"name": "title", "selector": "h1,h2", "type": "text"},
{"name": "text", "selector": "p", "type": "text"},
{"name": "links", "selector": "a", "type": "attribute", "attribute": "href"}
]
}
return JsonCssExtractionStrategy(schema)
return None
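# Illustrative only: with the schema above, extracted_content is expected to be a JSON
# string describing a list of objects, roughly of the form
# [{"title": "...", "text": "...", "links": "https://..."}, ...]; the exact shape
# depends on the page and on the crawl4ai version in use.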
async def crawl_with_subpages(
    request: CrawlRequest,
    base_url: str,
    current_depth: int = 1,
    visited: Optional[Set[str]] = None,
) -> Optional[Dict]:
"""Recursively crawl pages including sub-pages up to the specified depth."""
if visited is None:
visited = set()
if current_depth > request.max_depth or len(visited) >= request.max_pages:
return None
    # Normalize the URL (drop any #fragment) so the same page is not crawled twice
    normalized_url, _ = urllib.parse.urldefrag(request.url)
    if normalized_url in visited:
        return None
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
verbose=True,
word_count_threshold=request.word_count_threshold,
css_selector=request.css_selector,
excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
exclude_external_links=request.exclude_external_links,
wait_for=f"css:{request.css_selector}" if request.css_selector else None,
wait_for_images=True,
page_timeout=30000,
scan_full_page=request.scan_full_page,
scroll_delay=request.scroll_delay,
extraction_strategy=create_extraction_strategy(
request.extraction_type,
request.css_selector,
request.xpath_query
)
)
    # Docker-optimized browser configuration: --no-sandbox and --disable-dev-shm-usage
    # are typically required to run Chromium inside a container
browser_config = BrowserConfig(
headless=True,
viewport_width=1920,
viewport_height=1080,
chrome_path=CHROME_PATH,
args=[
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu"
]
)
results = {
"pages": [],
"total_links": 0,
"visited_pages": len(visited)
}
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url=request.url, config=run_config)
if not result.success:
print(f"Failed to crawl {request.url}: {result.error_message}")
return None
page_result = {
"url": request.url,
"markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
"extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
"depth": current_depth
}
results["pages"].append(page_result)
visited.add(normalized_url)
if request.crawl_subpages and hasattr(result, 'links'):
internal_links = result.links.get("internal", [])
if internal_links:
results["total_links"] += len(internal_links)
for link in internal_links:
if len(visited) >= request.max_pages:
break
                        try:
                            # crawl4ai exposes internal links as dicts ({"href": ..., "text": ...}); fall back to plain strings
                            href = link.get("href") if isinstance(link, dict) else link
                            if not href:
                                continue
                            normalized_link, _ = urllib.parse.urldefrag(urllib.parse.urljoin(request.url, href))
                            link_domain = urllib.parse.urlparse(normalized_link).netloc
                            if normalized_link in visited or (request.exclude_external_links and link_domain != base_url):
                                continue
sub_request = CrawlRequest(
**{**request.dict(), "url": normalized_link}
)
sub_result = await crawl_with_subpages(
sub_request,
base_url,
current_depth + 1,
visited
)
if sub_result:
results["pages"].extend(sub_result["pages"])
results["total_links"] += sub_result["total_links"]
results["visited_pages"] = len(visited)
except Exception as e:
print(f"Error processing link {link}: {str(e)}")
continue
return results
except Exception as e:
print(f"Error crawling {request.url}: {str(e)}")
return None
async def crawl_url(request: CrawlRequest) -> Dict:
"""Crawl a URL and return the extracted content."""
try:
base_url = urllib.parse.urlparse(request.url).netloc
if request.crawl_subpages:
results = await crawl_with_subpages(request, base_url)
if not results or not results["pages"]:
raise Exception(f"Failed to crawl pages starting from {request.url}")
            # Separate pages with a horizontal rule in the combined markdown output
            combined_markdown = "\n\n---\n\n".join(
                f"## Page: {page['url']}\n{page['markdown']}"
                for page in results["pages"]
            )
return {
"markdown": combined_markdown,
"metadata": {
"url": request.url,
"crawler_type": request.crawler_type.value,
"extraction_type": request.extraction_type.value,
"word_count_threshold": request.word_count_threshold,
"css_selector": request.css_selector,
"xpath_query": request.xpath_query,
"scan_full_page": request.scan_full_page,
"scroll_delay": request.scroll_delay,
"total_pages_crawled": results["visited_pages"],
"total_links_found": results["total_links"],
"max_depth_reached": min(request.max_depth, max(page["depth"] for page in results["pages"]))
},
"pages": results["pages"]
}
else:
wait_condition = f"css:{request.css_selector}" if request.css_selector else None
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=request.word_count_threshold,
css_selector=request.css_selector,
excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
wait_for=wait_condition,
wait_for_images=True,
page_timeout=30000,
scan_full_page=request.scan_full_page,
scroll_delay=request.scroll_delay,
extraction_strategy=create_extraction_strategy(
request.extraction_type,
request.css_selector,
request.xpath_query
)
)
# Docker-optimized browser configuration
browser_config = BrowserConfig(
headless=True,
viewport_width=1920,
viewport_height=1080,
chrome_path=CHROME_PATH,
args=[
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu"
]
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url=request.url, config=run_config)
if not result.success:
raise Exception(result.error_message)
images = result.media.get("images", []) if hasattr(result, 'media') else []
image_info = "\n### Images Found\n" if images else ""
for i, img in enumerate(images[:5]):
image_info += f"- Image {i+1}: {img.get('src', 'N/A')}\n"
if img.get('alt'):
image_info += f" Alt: {img['alt']}\n"
if img.get('score'):
image_info += f" Score: {img['score']}\n"
return {
"markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
"metadata": {
"url": request.url,
"crawler_type": request.crawler_type.value,
"extraction_type": request.extraction_type.value,
"word_count_threshold": request.word_count_threshold,
"css_selector": request.css_selector,
"xpath_query": request.xpath_query,
"scan_full_page": request.scan_full_page,
"scroll_delay": request.scroll_delay,
"wait_condition": wait_condition
},
"extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
"image_info": image_info
}
    except Exception as e:
        # Surface the failure with the URL so the Gradio handler can show a useful message
        raise Exception(f"Crawling failed for {request.url}: {e}") from e
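# Illustrative (not part of the UI flow): crawl_url can also be driven programmatically,
# e.g. asyncio.run(crawl_url(CrawlRequest(url="https://example.com")))["markdown"]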
async def gradio_crawl(
url: str,
crawler_type: str,
extraction_type: str,
word_count_threshold: int,
css_selector: str,
xpath_query: str,
scan_full_page: bool,
scroll_delay: float,
crawl_subpages: bool,
max_depth: int,
max_pages: int,
exclude_external_links: bool
) -> tuple[str, str]:
"""Handle crawling requests from the Gradio interface."""
try:
request = CrawlRequest(
url=url,
            crawler_type=CrawlerType(crawler_type.lower().replace("/", "_")),  # maps "JSON/CSS" -> "json_css"
extraction_type=ExtractionType(extraction_type.lower()),
word_count_threshold=word_count_threshold,
css_selector=css_selector if css_selector else None,
xpath_query=xpath_query if xpath_query else None,
scan_full_page=scan_full_page,
scroll_delay=scroll_delay,
crawl_subpages=crawl_subpages,
max_depth=max_depth,
max_pages=max_pages,
exclude_external_links=exclude_external_links
)
result = await crawl_url(request)
markdown_content = str(result["markdown"]) if result.get("markdown") else ""
metadata_str = f"""### Metadata
- URL: {result['metadata']['url']}
- Crawler Type: {result['metadata']['crawler_type']}
- Extraction Type: {result['metadata']['extraction_type']}
- Word Count Threshold: {result['metadata']['word_count_threshold']}
- CSS Selector: {result['metadata']['css_selector'] or 'None'}
- XPath Query: {result['metadata']['xpath_query'] or 'None'}
- Full Page Scan: {result['metadata']['scan_full_page']}
- Scroll Delay: {result['metadata']['scroll_delay']}s"""
if crawl_subpages:
metadata_str += f"""
- Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)}
- Total Links Found: {result['metadata'].get('total_links_found', 0)}
- Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}"""
if result.get('image_info'):
metadata_str += f"\n\n{result['image_info']}"
if result.get("extracted_content"):
metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"
return markdown_content, metadata_str
except Exception as e:
error_msg = f"Error: {str(e)}"
return error_msg, "Error occurred while crawling"
# Create Gradio interface with simplified configuration
with gr.Blocks(title="Crawl4AI Demo") as demo:
gr.Markdown("""
# Crawl4AI Web Content Extractor
Extract content from web pages using different crawling and extraction strategies.
""")
with gr.Row():
with gr.Column():
url_input = gr.Textbox(
label="URL",
placeholder="Enter URL to crawl",
info="The webpage URL to extract content from"
)
crawler_type = gr.Dropdown(
choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
label="Crawler Type",
value="Basic",
info="Select the content extraction strategy"
)
extraction_type = gr.Dropdown(
choices=["Default", "CSS", "XPath", "LLM", "Combined"],
label="Extraction Type",
value="Default",
info="Choose how to extract content from the page"
)
word_count = gr.Slider(
minimum=50,
maximum=500,
value=100,
step=50,
label="Word Count Threshold",
info="Minimum number of words required for content extraction"
)
css_selector = gr.Textbox(
label="CSS Selector",
placeholder="e.g., article.content, main.post",
info="CSS selector to target specific content"
)
xpath_query = gr.Textbox(
label="XPath Query",
placeholder="e.g., //article[@class='content']",
info="XPath query to target specific content"
)
with gr.Column():
scan_full_page = gr.Checkbox(
label="Scan Full Page",
value=False,
info="Enable to scroll through the entire page"
)
scroll_delay = gr.Slider(
minimum=0.1,
maximum=2.0,
value=0.5,
step=0.1,
label="Scroll Delay",
info="Delay between scroll steps in seconds"
)
crawl_subpages = gr.Checkbox(
label="Crawl Sub-pages",
value=False,
info="Enable to crawl links found on the page"
)
max_depth = gr.Slider(
minimum=1,
maximum=5,
value=1,
step=1,
label="Max Crawl Depth",
info="Maximum depth for recursive crawling"
)
max_pages = gr.Slider(
minimum=1,
maximum=50,
value=10,
step=5,
label="Max Pages",
info="Maximum number of pages to crawl"
)
exclude_external = gr.Checkbox(
label="Exclude External Links",
value=True,
info="Only crawl links within the same domain"
)
with gr.Row():
crawl_button = gr.Button("Start Crawling")
with gr.Row():
output_markdown = gr.Markdown(label="Generated Markdown")
output_metadata = gr.Markdown(label="Metadata & Results")
crawl_button.click(
fn=gradio_crawl,
inputs=[
url_input, crawler_type, extraction_type,
word_count, css_selector, xpath_query,
scan_full_page, scroll_delay, crawl_subpages,
max_depth, max_pages, exclude_external
],
outputs=[output_markdown, output_metadata]
)
gr.Examples(
examples=[
["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
],
inputs=[
url_input, crawler_type, extraction_type,
word_count, css_selector, xpath_query,
scan_full_page, scroll_delay, crawl_subpages,
max_depth, max_pages, exclude_external
]
)
if __name__ == "__main__":
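    # 0.0.0.0:7860 matches the port Hugging Face Spaces expects a Gradio app to serve on by default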
demo.launch(server_name="0.0.0.0", server_port=7860)