hellorahulk committed on
Commit 0034b95 · verified · 1 Parent(s): a87d676

Upload 2 files

Files changed (2)
  1. app.py +641 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,641 @@
+"""
+Crawl4AI Demo Application
+=========================
+
+This application provides a web interface and API for the Crawl4AI library, allowing users to extract
+content from web pages using different crawling strategies.
+
+Features:
+---------
+- Web interface built with Gradio for interactive use
+- RESTful API endpoint for programmatic access
+- Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS)
+- Configurable word count threshold
+- Markdown output with metadata
+
+Usage:
+------
+1. Start the server:
+```
+python app.py
+```
+2. Access the web interface at http://localhost:8000
+3. Use the API endpoint at http://localhost:8000/api/crawl
+
+API Example:
+------------
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:8000/api/crawl",
+    json={
+        "url": "https://example.com",
+        "crawler_type": "basic",
+        "word_count_threshold": 100
+    }
+)
+result = response.json()
+```
+
+Dependencies:
+-------------
+- gradio
+- fastapi
+- crawl4ai
+- uvicorn
+"""
+
+import gradio as gr
+import asyncio
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from enum import Enum
+from typing import Optional, Dict, Any, List, Set
+from contextlib import asynccontextmanager
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from playwright.async_api import async_playwright
+import urllib.parse
+
+class CrawlerType(str, Enum):
+    """
+    Enumeration of supported crawler types.
+
+    Attributes:
+        BASIC (str): Simple HTML parsing and content extraction
+        LLM (str): Language model-based content extraction
+        COSINE (str): Cosine similarity-based content extraction
+        JSON_CSS (str): JSON/CSS selector-based content extraction
+    """
+    BASIC = "basic"
+    LLM = "llm"
+    COSINE = "cosine"
+    JSON_CSS = "json_css"
+
+class ExtractionType(str, Enum):
+    """
+    Enumeration of supported extraction strategies.
+
+    Attributes:
+        DEFAULT (str): Default extraction without specific strategy
+        CSS (str): CSS selector-based extraction
+        XPATH (str): XPath-based extraction
+        LLM (str): Language model-based extraction
+        COMBINED (str): Combined strategy using multiple approaches
+    """
+    DEFAULT = "default"
+    CSS = "css"
+    XPATH = "xpath"
+    LLM = "llm"
+    COMBINED = "combined"
+
+class CrawlRequest(BaseModel):
+    """
+    Request model for crawling operations.
+
+    Attributes:
+        url (str): The URL to crawl
+        crawler_type (CrawlerType): The type of crawler to use
+        extraction_type (ExtractionType): The extraction strategy to use
+        word_count_threshold (int): Minimum word count for extracted content
+        css_selector (Optional[str]): CSS selector for content extraction
+        xpath_query (Optional[str]): XPath query for content extraction
+        excluded_tags (Optional[list]): HTML tags to exclude from extraction
+        scan_full_page (bool): Whether to scan the entire page for lazy-loaded content
+        scroll_delay (float): Delay between scroll steps in seconds
+        crawl_subpages (bool): Whether to crawl sub-pages found in links
+        max_depth (int): Maximum depth for recursive crawling (1 = only direct links)
+        exclude_external_links (bool): Whether to exclude links to external domains
+        max_pages (int): Maximum number of pages to crawl
+    """
+    url: str
+    crawler_type: CrawlerType = CrawlerType.BASIC
+    extraction_type: ExtractionType = ExtractionType.DEFAULT
+    word_count_threshold: int = 100
+    css_selector: Optional[str] = None
+    xpath_query: Optional[str] = None
+    excluded_tags: Optional[list] = None
+    scan_full_page: bool = False
+    scroll_delay: float = 0.5
+    crawl_subpages: bool = False
+    max_depth: int = 1
+    exclude_external_links: bool = True
+    max_pages: int = 10
+
+# Global crawler variable
+crawler = None
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Lifespan context manager for FastAPI application.
+    Handles crawler initialization and cleanup.
+    """
+    global crawler
+
+    # Initialize browser configuration
+    browser_config = BrowserConfig(
+        headless=True,
+        viewport_width=1920,
+        viewport_height=1080
+    )
+
+    # Create and initialize crawler
+    try:
+        crawler = AsyncWebCrawler(config=browser_config)
+        print("Crawler initialized successfully")
+        yield
+    finally:
+        if crawler:
+            await crawler.close()
+            print("Crawler resources cleaned up")
+
+# Create FastAPI app with lifespan handler
+app = FastAPI(
+    title="Crawl4AI Demo",
+    description="A web interface and API for extracting content from web pages using Crawl4AI",
+    version="1.0.0",
+    lifespan=lifespan
+)
+
+# Note: startup and shutdown are handled entirely by the lifespan context manager
+# above; the deprecated @app.on_event("startup"/"shutdown") hooks are not used.
+
+def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Optional[str] = None, xpath_query: Optional[str] = None) -> Any:
+    """
+    Create an extraction strategy based on the specified type.
+
+    Args:
+        extraction_type (ExtractionType): The type of extraction strategy
+        css_selector (Optional[str]): CSS selector for content extraction
+        xpath_query (Optional[str]): XPath query for content extraction
+
+    Returns:
+        Any: The configured extraction strategy
+    """
+    if extraction_type == ExtractionType.CSS and css_selector:
+        schema = {
+            "name": "Content",
+            "baseSelector": css_selector,
+            "fields": [
+                {"name": "title", "selector": "h1,h2", "type": "text"},
+                {"name": "text", "selector": "p", "type": "text"},
+                {"name": "links", "selector": "a", "type": "attribute", "attribute": "href"}
+            ]
+        }
+        return JsonCssExtractionStrategy(schema)
+    return None
+
+async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict:
+    """
+    Recursively crawl pages including sub-pages up to the specified depth.
+    """
+    if visited is None:
+        visited = set()
+
+    if current_depth > request.max_depth or len(visited) >= request.max_pages:
+        return None
+
+    # Normalize URL (strip any #fragment) so duplicate pages are not re-crawled
+    normalized_url = urllib.parse.urldefrag(request.url)[0]
+    if normalized_url in visited:
+        return None
+
+    # Create run configuration for current page
+    run_config = CrawlerRunConfig(
+        # Core settings
+        cache_mode=CacheMode.BYPASS,
+        verbose=True,  # Enable verbose logging
+
+        # Content settings
+        word_count_threshold=request.word_count_threshold,
+        css_selector=request.css_selector,
+        excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
+        exclude_external_links=request.exclude_external_links,
+
+        # Page & JS settings
+        wait_for=f"css:{request.css_selector}" if request.css_selector else None,
+        wait_for_images=True,
+        page_timeout=30000,
+
+        # Lazy loading settings
+        scan_full_page=request.scan_full_page,
+        scroll_delay=request.scroll_delay,
+
+        # Extraction settings
+        extraction_strategy=create_extraction_strategy(
+            request.extraction_type,
+            request.css_selector,
+            request.xpath_query
+        )
+    )
+
+    browser_config = BrowserConfig(
+        headless=True,
+        viewport_width=1920,
+        viewport_height=1080
+    )
+
+    results = {
+        "pages": [],
+        "total_links": 0,
+        "visited_pages": len(visited)
+    }
+
+    try:
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result = await crawler.arun(url=request.url, config=run_config)
+
+            if not result.success:
+                print(f"Failed to crawl {request.url}: {result.error_message}")
+                return None
+
+            # Add current page result
+            page_result = {
+                "url": request.url,
+                "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
+                "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
+                "depth": current_depth
+            }
+            results["pages"].append(page_result)
+            visited.add(normalized_url)
+
+            # Process sub-pages if enabled
+            if request.crawl_subpages and hasattr(result, 'links'):
+                internal_links = result.links.get("internal", [])
+                if internal_links:
+                    results["total_links"] += len(internal_links)
+
+                    for link in internal_links:
+                        if len(visited) >= request.max_pages:
+                            break
+
+                        # Normalize and validate the link
+                        try:
+                            # result.links entries may be dicts with an "href" key or plain strings
+                            normalized_link = urllib.parse.urljoin(request.url, link.get("href", "") if isinstance(link, dict) else link)
+                            link_domain = urllib.parse.urlparse(normalized_link).netloc
+
+                            # Skip if already visited or external link
+                            if normalized_link in visited or (request.exclude_external_links and link_domain != base_url):
+                                continue
+
+                            # Create new request for sub-page
+                            sub_request = CrawlRequest(
+                                **{**request.dict(), "url": normalized_link}
+                            )
+
+                            # Recursively crawl sub-page
+                            sub_result = await crawl_with_subpages(
+                                sub_request,
+                                base_url,
+                                current_depth + 1,
+                                visited
+                            )
+
+                            if sub_result:
+                                results["pages"].extend(sub_result["pages"])
+                                results["total_links"] += sub_result["total_links"]
+                                results["visited_pages"] = len(visited)
+                        except Exception as e:
+                            print(f"Error processing link {link}: {str(e)}")
+                            continue
+
+            return results
+    except Exception as e:
+        print(f"Error crawling {request.url}: {str(e)}")
+        return None
+
+@app.post("/api/crawl")
+async def crawl_url(request: CrawlRequest):
+    """
+    API endpoint to crawl a URL and return the extracted content.
+    """
+    try:
+        base_url = urllib.parse.urlparse(request.url).netloc
+
+        if request.crawl_subpages:
+            results = await crawl_with_subpages(request, base_url)
+            if not results or not results["pages"]:
+                raise HTTPException(status_code=500, detail=f"Failed to crawl pages starting from {request.url}")
+
+            # Combine results from all pages
+            combined_markdown = "\n\n---\n\n".join(
+                f"## Page: {page['url']}\n{page['markdown']}"
+                for page in results["pages"]
+            )
+
+            return {
+                "markdown": combined_markdown,
+                "metadata": {
+                    "url": request.url,
+                    "crawler_type": request.crawler_type.value,
+                    "extraction_type": request.extraction_type.value,
+                    "word_count_threshold": request.word_count_threshold,
+                    "css_selector": request.css_selector,
+                    "xpath_query": request.xpath_query,
+                    "scan_full_page": request.scan_full_page,
+                    "scroll_delay": request.scroll_delay,
+                    "total_pages_crawled": results["visited_pages"],
+                    "total_links_found": results["total_links"],
+                    "max_depth_reached": min(request.max_depth, max(page["depth"] for page in results["pages"]))
+                },
+                "pages": results["pages"]
+            }
+        else:
+            # Format wait_for condition properly if CSS selector is provided
+            wait_condition = f"css:{request.css_selector}" if request.css_selector else None
+
+            # Create run configuration
+            run_config = CrawlerRunConfig(
+                # Core settings
+                cache_mode=CacheMode.BYPASS,
+
+                # Content settings
+                word_count_threshold=request.word_count_threshold,
+                css_selector=request.css_selector,
+                excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
+
+                # Page & JS settings
+                wait_for=wait_condition,  # Using properly formatted wait condition
+                wait_for_images=True,  # Always wait for images to load
+                page_timeout=30000,  # 30 seconds timeout for page operations
+
+                # Lazy loading settings
+                scan_full_page=request.scan_full_page,
+                scroll_delay=request.scroll_delay,
+
+                # Extraction settings
+                extraction_strategy=create_extraction_strategy(
+                    request.extraction_type,
+                    request.css_selector,
+                    request.xpath_query
+                )
+            )
+
+            # Create browser config with optimized settings
+            browser_config = BrowserConfig(
+                headless=True,
+                viewport_width=1920,
+                viewport_height=1080
+            )
+
+            async with AsyncWebCrawler(config=browser_config) as temp_crawler:
+                try:
+                    result = await temp_crawler.arun(
+                        url=request.url,
+                        config=run_config
+                    )
+
+                    if not result.success:
+                        raise HTTPException(status_code=500, detail=result.error_message)
+
+                    # Get image information
+                    images = result.media.get("images", []) if hasattr(result, 'media') else []
+                    image_info = "\n### Images Found\n" if images else ""
+                    for i, img in enumerate(images[:5]):  # Show first 5 images
+                        image_info += f"- Image {i+1}: {img.get('src', 'N/A')}\n"
+                        if img.get('alt'):
+                            image_info += f" Alt: {img['alt']}\n"
+                        if img.get('score'):
+                            image_info += f" Score: {img['score']}\n"
+
+                    return {
+                        "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
+                        "metadata": {
+                            "url": request.url,
+                            "crawler_type": request.crawler_type.value,
+                            "extraction_type": request.extraction_type.value,
+                            "word_count_threshold": request.word_count_threshold,
+                            "css_selector": request.css_selector,
+                            "xpath_query": request.xpath_query,
+                            "scan_full_page": request.scan_full_page,
+                            "scroll_delay": request.scroll_delay,
+                            "wait_condition": wait_condition
+                        },
+                        "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
+                        "image_info": image_info
+                    }
+                except Exception as e:
+                    # More specific error handling
+                    error_msg = str(e)
+                    if "Wait condition failed" in error_msg:
+                        error_msg = f"Failed to find element matching selector '{request.css_selector}'. Please check if the selector is correct."
+                    elif "TimeoutError" in error_msg:
+                        error_msg = "Page took too long to load. Please try again or check the URL."
+                    raise HTTPException(status_code=500, detail=error_msg)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+async def gradio_crawl(
+    url: str,
+    crawler_type: str,
+    extraction_type: str,
+    word_count_threshold: int,
+    css_selector: str,
+    xpath_query: str,
+    scan_full_page: bool,
+    scroll_delay: float,
+    crawl_subpages: bool,
+    max_depth: int,
+    max_pages: int,
+    exclude_external_links: bool
+) -> tuple[str, str]:
+    """
+    Gradio interface function to handle crawling requests from the web UI.
+
+    Args:
+        url (str): The webpage URL to crawl
+        crawler_type (str): Type of crawler to use
+        extraction_type (str): Type of extraction strategy
+        word_count_threshold (int): Minimum word count threshold
+        css_selector (str): CSS selector for content targeting
+        xpath_query (str): XPath query for content targeting
+        scan_full_page (bool): Whether to scan the full page
+        scroll_delay (float): Delay between scroll steps
+        crawl_subpages (bool): Whether to crawl sub-pages
+        max_depth (int): Maximum crawl depth
+        max_pages (int): Maximum number of pages to crawl
+        exclude_external_links (bool): Whether to exclude external links
+
+    Returns:
+        tuple[str, str]: Tuple containing (markdown_content, metadata_string)
+    """
+    request = CrawlRequest(
+        url=url,
+        crawler_type=CrawlerType(crawler_type.lower().replace("/", "_")),  # "JSON/CSS" -> "json_css"
+        extraction_type=ExtractionType(extraction_type.lower()),
+        word_count_threshold=word_count_threshold,
+        css_selector=css_selector if css_selector else None,
+        xpath_query=xpath_query if xpath_query else None,
+        scan_full_page=scan_full_page,
+        scroll_delay=scroll_delay,
+        crawl_subpages=crawl_subpages,
+        max_depth=max_depth,
+        max_pages=max_pages,
+        exclude_external_links=exclude_external_links
+    )
+
+    try:
+        result = await crawl_url(request)
+
+        # Convert markdown result to string if it exists
+        markdown_content = str(result["markdown"]) if result.get("markdown") else ""
+
+        # Format the metadata and results
+        metadata_str = f"""### Metadata
+- URL: {result['metadata']['url']}
+- Crawler Type: {result['metadata']['crawler_type']}
+- Extraction Type: {result['metadata']['extraction_type']}
+- Word Count Threshold: {result['metadata']['word_count_threshold']}
+- CSS Selector: {result['metadata']['css_selector'] or 'None'}
+- XPath Query: {result['metadata']['xpath_query'] or 'None'}
+- Full Page Scan: {result['metadata']['scan_full_page']}
+- Scroll Delay: {result['metadata']['scroll_delay']}s"""
+
+        # Add sub-page crawling information if enabled
+        if crawl_subpages:
+            metadata_str += f"""
+- Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)}
+- Total Links Found: {result['metadata'].get('total_links_found', 0)}
+- Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}"""
+
+        # Add image information if available
+        if result.get('image_info'):
+            metadata_str += f"\n\n{result['image_info']}"
+
+        # Add extracted content if available
+        if result.get("extracted_content"):
+            metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"
+
+        return markdown_content, metadata_str
+    except Exception as e:
+        error_msg = f"Error: {str(e)}"
+        return error_msg, "Error occurred while crawling"
+
+# Create Gradio interface with enhanced documentation
+demo = gr.Interface(
+    fn=gradio_crawl,
+    inputs=[
+        gr.Textbox(
+            label="URL",
+            placeholder="Enter URL to crawl",
+            info="The webpage URL to extract content from"
+        ),
+        gr.Dropdown(
+            choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
+            label="Crawler Type",
+            value="Basic",
+            info="Select the content extraction strategy"
+        ),
+        gr.Dropdown(
+            choices=["Default", "CSS", "XPath", "LLM", "Combined"],
+            label="Extraction Type",
+            value="Default",
+            info="Choose how to extract content from the page"
+        ),
+        gr.Slider(
+            minimum=50,
+            maximum=500,
+            value=100,
+            step=50,
+            label="Word Count Threshold",
+            info="Minimum number of words required for content extraction"
+        ),
+        gr.Textbox(
+            label="CSS Selector",
+            placeholder="e.g., article.content, main.post",
+            info="CSS selector to target specific content (used with CSS extraction type)"
+        ),
+        gr.Textbox(
+            label="XPath Query",
+            placeholder="e.g., //article[@class='content']",
+            info="XPath query to target specific content (used with XPath extraction type)"
+        ),
+        gr.Checkbox(
+            label="Scan Full Page",
+            value=False,
+            info="Enable to scroll through the entire page to load lazy content"
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=2.0,
+            value=0.5,
+            step=0.1,
+            label="Scroll Delay",
+            info="Delay between scroll steps in seconds when scanning full page"
+        ),
+        gr.Checkbox(
+            label="Crawl Sub-pages",
+            value=False,
+            info="Enable to crawl links found on the page"
+        ),
+        gr.Slider(
+            minimum=1,
+            maximum=5,
+            value=1,
+            step=1,
+            label="Max Crawl Depth",
+            info="Maximum depth for recursive crawling (1 = only direct links)"
+        ),
+        gr.Slider(
+            minimum=1,
+            maximum=50,
+            value=10,
+            step=5,
+            label="Max Pages",
+            info="Maximum number of pages to crawl"
+        ),
+        gr.Checkbox(
+            label="Exclude External Links",
+            value=True,
+            info="Only crawl links within the same domain"
+        )
+    ],
+    outputs=[
+        gr.Markdown(label="Generated Markdown"),
+        gr.Markdown(label="Metadata & Extraction Results")
+    ],
+    title="Crawl4AI Demo",
+    description="""
+This demo allows you to extract content from web pages using different crawling and extraction strategies.
+
+1. Enter a URL to crawl
+2. Select a crawler type (Basic, LLM, Cosine, JSON/CSS)
+3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
+4. Configure additional options:
+   - Word count threshold for content filtering
+   - CSS selectors for targeting specific content
+   - XPath queries for precise extraction
+   - Full page scanning for lazy-loaded content
+   - Scroll delay for controlling page scanning speed
+   - Sub-page crawling with depth control
+   - Maximum number of pages to crawl
+   - External link filtering
+
+The extracted content will be displayed in markdown format along with metadata and extraction results.
+When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
+    """
+)
+
+# Mount Gradio app to FastAPI
+app = gr.mount_gradio_app(app, demo, path="/")
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
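
For reference, here is a minimal client sketch (not part of the commit) that exercises the sub-page crawling options defined by `CrawlRequest` above. It assumes `python app.py` is running locally on port 8000, and the target URL and limits are only illustrative.

```python
# Hypothetical client for the /api/crawl endpoint defined in app.py.
# Assumes the server is running on localhost:8000; values below are examples.
import requests

payload = {
    "url": "https://example.com",
    "crawler_type": "basic",
    "extraction_type": "default",
    "word_count_threshold": 100,
    "crawl_subpages": True,          # follow internal links found on the page
    "max_depth": 2,                  # direct links plus one more level
    "max_pages": 5,                  # hard cap on pages visited
    "exclude_external_links": True   # stay on the starting domain
}

response = requests.post("http://localhost:8000/api/crawl", json=payload, timeout=300)
response.raise_for_status()
data = response.json()

print(f"Crawled {data['metadata']['total_pages_crawled']} pages, "
      f"{data['metadata']['total_links_found']} links found")
print(data["markdown"][:500])  # combined markdown across all crawled pages
```
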
requirements.txt ADDED
@@ -0,0 +1,7 @@
+crawl4ai>=0.4.3b0
+fastapi>=0.104.1
+uvicorn>=0.24.0
+gradio==4.0.0
+python-dotenv>=1.0.0
+pydantic>=2.5.0
+aiofiles==23.2.1
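
After installing the pinned requirements, a hedged example (again not part of the commit) of requesting CSS-based extraction, which routes through the `JsonCssExtractionStrategy` schema built in `create_extraction_strategy`; the selector is a placeholder to adapt to the target page.

```python
# Illustrative only: CSS-based extraction against the running demo server.
# "article" is a placeholder selector; app.py also reuses it as the wait_for condition.
import json
import requests

response = requests.post(
    "http://localhost:8000/api/crawl",
    json={
        "url": "https://example.com",
        "extraction_type": "css",
        "css_selector": "article",
        "word_count_threshold": 50
    },
    timeout=120
)
result = response.json()

# extracted_content is expected to be a JSON string with "title", "text" and
# "links" fields per element matched by the schema in create_extraction_strategy.
if result.get("extracted_content"):
    print(json.dumps(json.loads(result["extracted_content"]), indent=2)[:1000])
else:
    print("No structured content extracted; see the markdown output instead.")
```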