hellorahulk committed on
Commit
fede52d
·
verified ·
1 Parent(s): 0c96379

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -106
app.py CHANGED
@@ -348,114 +348,123 @@ async def gradio_crawl(
348
  error_msg = f"Error: {str(e)}"
349
  return error_msg, "Error occurred while crawling"
350
 
351
- # Create Gradio interface with Docker-optimized settings
352
- demo = gr.Interface(
353
- fn=gradio_crawl,
354
- inputs=[
355
- gr.Textbox(
356
- label="URL",
357
- placeholder="Enter URL to crawl",
358
- info="The webpage URL to extract content from"
359
- ),
360
- gr.Dropdown(
361
- choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
362
- label="Crawler Type",
363
- value="Basic",
364
- info="Select the content extraction strategy"
365
- ),
366
- gr.Dropdown(
367
- choices=["Default", "CSS", "XPath", "LLM", "Combined"],
368
- label="Extraction Type",
369
- value="Default",
370
- info="Choose how to extract content from the page"
371
- ),
372
- gr.Slider(
373
- minimum=50,
374
- maximum=500,
375
- value=100,
376
- step=50,
377
- label="Word Count Threshold",
378
- info="Minimum number of words required for content extraction"
379
- ),
380
- gr.Textbox(
381
- label="CSS Selector",
382
- placeholder="e.g., article.content, main.post",
383
- info="CSS selector to target specific content (used with CSS extraction type)"
384
- ),
385
- gr.Textbox(
386
- label="XPath Query",
387
- placeholder="e.g., //article[@class='content']",
388
- info="XPath query to target specific content (used with XPath extraction type)"
389
- ),
390
- gr.Checkbox(
391
- label="Scan Full Page",
392
- value=False,
393
- info="Enable to scroll through the entire page to load lazy content"
394
- ),
395
- gr.Slider(
396
- minimum=0.1,
397
- maximum=2.0,
398
- value=0.5,
399
- step=0.1,
400
- label="Scroll Delay",
401
- info="Delay between scroll steps in seconds when scanning full page"
402
- ),
403
- gr.Checkbox(
404
- label="Crawl Sub-pages",
405
- value=False,
406
- info="Enable to crawl links found on the page"
407
- ),
408
- gr.Slider(
409
- minimum=1,
410
- maximum=5,
411
- value=1,
412
- step=1,
413
- label="Max Crawl Depth",
414
- info="Maximum depth for recursive crawling (1 = only direct links)"
415
- ),
416
- gr.Slider(
417
- minimum=1,
418
- maximum=50,
419
- value=10,
420
- step=5,
421
- label="Max Pages",
422
- info="Maximum number of pages to crawl"
423
- ),
424
- gr.Checkbox(
425
- label="Exclude External Links",
426
- value=True,
427
- info="Only crawl links within the same domain"
428
- )
429
- ],
430
- outputs=[
431
- gr.Markdown(label="Generated Markdown"),
432
- gr.Markdown(label="Metadata & Extraction Results")
433
- ],
434
- title="Crawl4AI Demo",
435
- description="""
436
- This demo allows you to extract content from web pages using different crawling and extraction strategies.
437
 
438
- 1. Enter a URL to crawl
439
- 2. Select a crawler type (Basic, LLM, Cosine, JSON/CSS)
440
- 3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
441
- 4. Configure additional options:
442
- - Word count threshold for content filtering
443
- - CSS selectors for targeting specific content
444
- - XPath queries for precise extraction
445
- - Full page scanning for lazy-loaded content
446
- - Scroll delay for controlling page scanning speed
447
- - Sub-page crawling with depth control
448
- - Maximum number of pages to crawl
449
- - External link filtering
450
 
451
- The extracted content will be displayed in markdown format along with metadata and extraction results.
452
- When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
453
- """,
454
- examples=[
455
- ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
456
- ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
457
- ]
458
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
460
  if __name__ == "__main__":
461
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
348
  error_msg = f"Error: {str(e)}"
349
  return error_msg, "Error occurred while crawling"
350
 
351
+ # Create Gradio interface with simplified configuration
352
+ with gr.Blocks(title="Crawl4AI Demo") as demo:
353
+ gr.Markdown("""
354
+ # Crawl4AI Web Content Extractor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
+ Extract content from web pages using different crawling and extraction strategies.
357
+ """)
 
 
 
 
 
 
 
 
 
 
358
 
359
+ with gr.Row():
360
+ with gr.Column():
361
+ url_input = gr.Textbox(
362
+ label="URL",
363
+ placeholder="Enter URL to crawl",
364
+ info="The webpage URL to extract content from"
365
+ )
366
+ crawler_type = gr.Dropdown(
367
+ choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
368
+ label="Crawler Type",
369
+ value="Basic",
370
+ info="Select the content extraction strategy"
371
+ )
372
+ extraction_type = gr.Dropdown(
373
+ choices=["Default", "CSS", "XPath", "LLM", "Combined"],
374
+ label="Extraction Type",
375
+ value="Default",
376
+ info="Choose how to extract content from the page"
377
+ )
378
+ word_count = gr.Slider(
379
+ minimum=50,
380
+ maximum=500,
381
+ value=100,
382
+ step=50,
383
+ label="Word Count Threshold",
384
+ info="Minimum number of words required for content extraction"
385
+ )
386
+ css_selector = gr.Textbox(
387
+ label="CSS Selector",
388
+ placeholder="e.g., article.content, main.post",
389
+ info="CSS selector to target specific content"
390
+ )
391
+ xpath_query = gr.Textbox(
392
+ label="XPath Query",
393
+ placeholder="e.g., //article[@class='content']",
394
+ info="XPath query to target specific content"
395
+ )
396
+
397
+ with gr.Column():
398
+ scan_full_page = gr.Checkbox(
399
+ label="Scan Full Page",
400
+ value=False,
401
+ info="Enable to scroll through the entire page"
402
+ )
403
+ scroll_delay = gr.Slider(
404
+ minimum=0.1,
405
+ maximum=2.0,
406
+ value=0.5,
407
+ step=0.1,
408
+ label="Scroll Delay",
409
+ info="Delay between scroll steps in seconds"
410
+ )
411
+ crawl_subpages = gr.Checkbox(
412
+ label="Crawl Sub-pages",
413
+ value=False,
414
+ info="Enable to crawl links found on the page"
415
+ )
416
+ max_depth = gr.Slider(
417
+ minimum=1,
418
+ maximum=5,
419
+ value=1,
420
+ step=1,
421
+ label="Max Crawl Depth",
422
+ info="Maximum depth for recursive crawling"
423
+ )
424
+ max_pages = gr.Slider(
425
+ minimum=1,
426
+ maximum=50,
427
+ value=10,
428
+ step=5,
429
+ label="Max Pages",
430
+ info="Maximum number of pages to crawl"
431
+ )
432
+ exclude_external = gr.Checkbox(
433
+ label="Exclude External Links",
434
+ value=True,
435
+ info="Only crawl links within the same domain"
436
+ )
437
+
438
+ with gr.Row():
439
+ crawl_button = gr.Button("Start Crawling")
440
+
441
+ with gr.Row():
442
+ output_markdown = gr.Markdown(label="Generated Markdown")
443
+ output_metadata = gr.Markdown(label="Metadata & Results")
444
+
445
+ crawl_button.click(
446
+ fn=gradio_crawl,
447
+ inputs=[
448
+ url_input, crawler_type, extraction_type,
449
+ word_count, css_selector, xpath_query,
450
+ scan_full_page, scroll_delay, crawl_subpages,
451
+ max_depth, max_pages, exclude_external
452
+ ],
453
+ outputs=[output_markdown, output_metadata]
454
+ )
455
+
456
+ gr.Examples(
457
+ examples=[
458
+ ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
459
+ ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
460
+ ],
461
+ inputs=[
462
+ url_input, crawler_type, extraction_type,
463
+ word_count, css_selector, xpath_query,
464
+ scan_full_page, scroll_delay, crawl_subpages,
465
+ max_depth, max_pages, exclude_external
466
+ ]
467
+ )
468
 
469
  if __name__ == "__main__":
470
  demo.launch(server_name="0.0.0.0", server_port=7860)