Commit df521e6 (parent d99bc8b): update api handle 3
app.py CHANGED
```diff
@@ -1,6 +1,8 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, HttpUrl
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
 import uvicorn
 import asyncio
 import nest_asyncio
```
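The two filters imported here embody the commit's two modes. BM25ContentFilter ranks text chunks against a query with BM25 scoring and keeps the high scorers; PruningContentFilter needs no query and drops blocks that look like boilerplate based on heuristics such as length and link density. A quick side-by-side sketch, reusing the thresholds the handler sets below (parameter names are taken from this diff; verify them against the crawl4ai version the Space pins):

```python
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter

# Subject given: keep chunks relevant to the query.
by_subject = BM25ContentFilter(
    user_query="renewable energy",  # hypothetical subject, not from the diff
    bm25_threshold=1.2,
    use_stemming=True,              # match inflected forms of query terms
)

# No subject: keep whatever clears the generic pruning heuristics.
by_heuristics = PruningContentFilter(
    threshold=0.48,
    threshold_type="fixed",   # fixed cutoff rather than a dynamic one
    min_word_threshold=50,    # ignore blocks shorter than 50 words
)
```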
```diff
@@ -23,6 +25,7 @@ class CrawlRequest(BaseModel):
     cache_mode: str = "ENABLED"
     excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
+    subject: Optional[str] = None  # Optional subject for content filtering
 
 class Article(BaseModel):
     title: str
```
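The new subject field is the only API-visible change. Note that it relies on Optional, which this diff does not import; presumably `from typing import Optional` already sits in the unshown lines between the hunks, otherwise the model breaks at import time. A client would exercise the field like this (a sketch: the /crawl route and port are assumptions, since the route decorator is not part of this diff):

```python
import requests

# With "subject" set, the handler builds a BM25 filter from it;
# omit the field to fall back to the pruning filter.
resp = requests.post(
    "http://localhost:8000/crawl",          # assumed route and port
    json={
        "url": "https://example.com/news",  # any page to crawl
        "subject": "renewable energy",      # optional subject
    },
)
print(resp.json())
```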
```diff
@@ -184,12 +187,34 @@ async def crawl_url(request: CrawlRequest):
     try:
         cache_mode = getattr(CacheMode, request.cache_mode)
 
-        #
+        # Configure markdown generator based on whether subject is provided
+        if request.subject:
+            # Use BM25 filter when subject is provided
+            content_filter = BM25ContentFilter(
+                user_query=request.subject,
+                bm25_threshold=1.2,
+                use_stemming=True
+            )
+        else:
+            # Use default pruning filter when no subject
+            content_filter = PruningContentFilter(
+                threshold=0.48,
+                threshold_type="fixed",
+                min_word_threshold=50
+            )
+
+        md_generator = DefaultMarkdownGenerator(
+            content_filter=content_filter,
+            options={"ignore_images": True}
+        )
+
+        # Create crawler with configuration
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
                 cache_mode=cache_mode,
                 excluded_tags=request.excluded_tags,
                 remove_overlay_elements=request.remove_overlay_elements,
+                markdown_generator=md_generator,
                 exclude_external_links=True,
                 exclude_social_media_links=True,
                 exclude_external_images=True,
```
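Pulled out of the handler, the new wiring reduces to: pick a filter, hand it to a DefaultMarkdownGenerator, hand that to CrawlerRunConfig. A self-contained sketch of the subject branch, assuming a crawl4ai version matching the one this Space pins (names and thresholds as in the diff):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter

async def main() -> None:
    # Subject-driven branch of the handler, with the same thresholds.
    md_generator = DefaultMarkdownGenerator(
        content_filter=BM25ContentFilter(
            user_query="renewable energy",  # stand-in for request.subject
            bm25_threshold=1.2,
            use_stemming=True,
        ),
        options={"ignore_images": True},
    )
    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        markdown_generator=md_generator,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        # Filtered markdown is exposed separately from the raw conversion.
        print(result.markdown_v2.fit_markdown)

asyncio.run(main())
```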
```diff
@@ -201,15 +226,16 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )
 
-            #
+            # Extract content
             markdown = result.markdown_v2.raw_markdown
             html = result.html
 
-            # Extract
+            # Extract articles and metadata
             articles = extract_articles(markdown)
             metadata = extract_metadata(markdown, html)
 
-            # Add source URL to
+            # Add source URL and subject to metadata
+            metadata["subject"] = request.subject
             for article in articles:
                 article.source_url = str(request.url)
 
```
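One thing worth checking in this hunk: the handler still reads raw_markdown, which in crawl4ai versions where markdown_v2 is a MarkdownGenerationResult is the unfiltered conversion; the content filter's output lands in fit_markdown instead. If the intent is for extract_articles to see the BM25-filtered or pruned text, the read likely needs to change along these lines (hedged; verify against the pinned crawl4ai version):

```python
# fit_markdown holds the filtered text when a content_filter is attached;
# fall back to raw_markdown in case the filter produced nothing.
markdown = result.markdown_v2.fit_markdown or result.markdown_v2.raw_markdown
```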