Update app.py
app.py CHANGED
@@ -348,114 +348,123 @@ async def gradio_crawl(
         error_msg = f"Error: {str(e)}"
         return error_msg, "Error occurred while crawling"
 
-# Create Gradio interface with
-demo = gr.Interface(
-    fn=gradio_crawl,
-    inputs=[
-        gr.Textbox(
-            label="URL",
-            placeholder="Enter URL to crawl",
-            info="The webpage URL to extract content from"
-        ),
-        gr.Dropdown(
-            choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
-            label="Crawler Type",
-            value="Basic",
-            info="Select the content extraction strategy"
-        ),
-        gr.Dropdown(
-            choices=["Default", "CSS", "XPath", "LLM", "Combined"],
-            label="Extraction Type",
-            value="Default",
-            info="Choose how to extract content from the page"
-        ),
-        gr.Slider(
-            minimum=50,
-            maximum=500,
-            value=100,
-            step=50,
-            label="Word Count Threshold",
-            info="Minimum number of words required for content extraction"
-        ),
-        gr.Textbox(
-            label="CSS Selector",
-            placeholder="e.g., article.content, main.post",
-            info="CSS selector to target specific content (used with CSS extraction type)"
-        ),
-        gr.Textbox(
-            label="XPath Query",
-            placeholder="e.g., //article[@class='content']",
-            info="XPath query to target specific content (used with XPath extraction type)"
-        ),
-        gr.Checkbox(
-            label="Scan Full Page",
-            value=False,
-            info="Enable to scroll through the entire page to load lazy content"
-        ),
-        gr.Slider(
-            minimum=0.1,
-            maximum=2.0,
-            value=0.5,
-            step=0.1,
-            label="Scroll Delay",
-            info="Delay between scroll steps in seconds when scanning full page"
-        ),
-        gr.Checkbox(
-            label="Crawl Sub-pages",
-            value=False,
-            info="Enable to crawl links found on the page"
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=5,
-            value=1,
-            step=1,
-            label="Max Crawl Depth",
-            info="Maximum depth for recursive crawling (1 = only direct links)"
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=50,
-            value=10,
-            step=5,
-            label="Max Pages",
-            info="Maximum number of pages to crawl"
-        ),
-        gr.Checkbox(
-            label="Exclude External Links",
-            value=True,
-            info="Only crawl links within the same domain"
-        )
-    ],
-    outputs=[
-        gr.Markdown(label="Generated Markdown"),
-        gr.Markdown(label="Metadata & Extraction Results")
-    ],
-    title="Crawl4AI Demo",
-    description="""
-    This demo allows you to extract content from web pages using different crawling and extraction strategies.
-
-    1. Enter the URL you want to crawl
-    2. Select a crawler type (Basic, LLM, Cosine, JSON/CSS)
-    3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
-    4. Configure additional options:
-       - Word count threshold for content filtering
-       - CSS selectors for targeting specific content
-       - XPath queries for precise extraction
-       - Full page scanning for lazy-loaded content
-       - Scroll delay for controlling page scanning speed
-       - Sub-page crawling with depth control
-       - Maximum number of pages to crawl
-       - External link filtering
-
-    """
-)
+# Create Gradio interface with simplified configuration
+with gr.Blocks(title="Crawl4AI Demo") as demo:
+    gr.Markdown("""
+    # Crawl4AI Web Content Extractor
+
+    Extract content from web pages using different crawling and extraction strategies.
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            url_input = gr.Textbox(
+                label="URL",
+                placeholder="Enter URL to crawl",
+                info="The webpage URL to extract content from"
+            )
+            crawler_type = gr.Dropdown(
+                choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
+                label="Crawler Type",
+                value="Basic",
+                info="Select the content extraction strategy"
+            )
+            extraction_type = gr.Dropdown(
+                choices=["Default", "CSS", "XPath", "LLM", "Combined"],
+                label="Extraction Type",
+                value="Default",
+                info="Choose how to extract content from the page"
+            )
+            word_count = gr.Slider(
+                minimum=50,
+                maximum=500,
+                value=100,
+                step=50,
+                label="Word Count Threshold",
+                info="Minimum number of words required for content extraction"
+            )
+            css_selector = gr.Textbox(
+                label="CSS Selector",
+                placeholder="e.g., article.content, main.post",
+                info="CSS selector to target specific content"
+            )
+            xpath_query = gr.Textbox(
+                label="XPath Query",
+                placeholder="e.g., //article[@class='content']",
+                info="XPath query to target specific content"
+            )
+
+        with gr.Column():
+            scan_full_page = gr.Checkbox(
+                label="Scan Full Page",
+                value=False,
+                info="Enable to scroll through the entire page"
+            )
+            scroll_delay = gr.Slider(
+                minimum=0.1,
+                maximum=2.0,
+                value=0.5,
+                step=0.1,
+                label="Scroll Delay",
+                info="Delay between scroll steps in seconds"
+            )
+            crawl_subpages = gr.Checkbox(
+                label="Crawl Sub-pages",
+                value=False,
+                info="Enable to crawl links found on the page"
+            )
+            max_depth = gr.Slider(
+                minimum=1,
+                maximum=5,
+                value=1,
+                step=1,
+                label="Max Crawl Depth",
+                info="Maximum depth for recursive crawling"
+            )
+            max_pages = gr.Slider(
+                minimum=1,
+                maximum=50,
+                value=10,
+                step=5,
+                label="Max Pages",
+                info="Maximum number of pages to crawl"
+            )
+            exclude_external = gr.Checkbox(
+                label="Exclude External Links",
+                value=True,
+                info="Only crawl links within the same domain"
+            )
+
+    with gr.Row():
+        crawl_button = gr.Button("Start Crawling")
+
+    with gr.Row():
+        output_markdown = gr.Markdown(label="Generated Markdown")
+        output_metadata = gr.Markdown(label="Metadata & Results")
+
+    crawl_button.click(
+        fn=gradio_crawl,
+        inputs=[
+            url_input, crawler_type, extraction_type,
+            word_count, css_selector, xpath_query,
+            scan_full_page, scroll_delay, crawl_subpages,
+            max_depth, max_pages, exclude_external
+        ],
+        outputs=[output_markdown, output_metadata]
+    )
+
+    gr.Examples(
+        examples=[
+            ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
+            ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
+        ],
+        inputs=[
+            url_input, crawler_type, extraction_type,
+            word_count, css_selector, xpath_query,
+            scan_full_page, scroll_delay, crawl_subpages,
+            max_depth, max_pages, exclude_external
+        ]
+    )
 
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
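
For reference, the click() wiring above passes twelve inputs to gradio_crawl, in the order of the inputs=[...] list, and maps its two return values onto the two Markdown outputs. Below is a minimal sketch of a handler signature consistent with that wiring; only `async def gradio_crawl(` and the two error-path lines are visible in this hunk, so the parameter names, type hints, and stub body are assumptions, not this Space's actual code.

# Hypothetical stub: signature inferred from the inputs=[...] order above.
# Parameter names and types are assumptions; only the except branch is
# taken from the visible context lines of app.py.
async def gradio_crawl(
    url: str,
    crawler_type: str,
    extraction_type: str,
    word_count_threshold: int,
    css_selector: str,
    xpath_query: str,
    scan_full_page: bool,
    scroll_delay: float,
    crawl_subpages: bool,
    max_depth: int,
    max_pages: int,
    exclude_external: bool,
) -> tuple[str, str]:
    try:
        markdown = "..."   # crawl the URL and build the markdown output here
        metadata = "..."   # summarize extraction results here
        return markdown, metadata
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        return error_msg, "Error occurred while crawling"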