Commit df521e6 (parent d99bc8b): update api handle 3
app.py CHANGED
```diff
@@ -1,6 +1,8 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, HttpUrl
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
 import uvicorn
 import asyncio
 import nest_asyncio
```
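The two filters imported here embody the commit's two modes. BM25ContentFilter ranks text chunks against a query with BM25 scoring and keeps the high scorers; PruningContentFilter needs no query and drops blocks that look like boilerplate based on heuristics such as length and link density. A quick side-by-side sketch, reusing the thresholds the handler sets below (parameter names are taken from this diff; verify them against the crawl4ai version the Space pins):

```python
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter

# Subject given: keep chunks relevant to the query.
by_subject = BM25ContentFilter(
    user_query="renewable energy",  # hypothetical subject, not from the diff
    bm25_threshold=1.2,
    use_stemming=True,              # match inflected forms of query terms
)

# No subject: keep whatever clears the generic pruning heuristics.
by_heuristics = PruningContentFilter(
    threshold=0.48,
    threshold_type="fixed",   # fixed cutoff rather than a dynamic one
    min_word_threshold=50,    # ignore blocks shorter than 50 words
)
```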
```diff
@@ -23,6 +25,7 @@ class CrawlRequest(BaseModel):
     cache_mode: str = "ENABLED"
     excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
+    subject: Optional[str] = None  # Optional subject for content filtering
 
 class Article(BaseModel):
     title: str
```
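The new subject field is the only API-visible change. Note that it relies on Optional, which this diff does not import; presumably `from typing import Optional` already sits in the unshown lines between the hunks, otherwise the model breaks at import time. A client would exercise the field like this (a sketch: the /crawl route and port are assumptions, since the route decorator is not part of this diff):

```python
import requests

# With "subject" set, the handler builds a BM25 filter from it;
# omit the field to fall back to the pruning filter.
resp = requests.post(
    "http://localhost:8000/crawl",          # assumed route and port
    json={
        "url": "https://example.com/news",  # any page to crawl
        "subject": "renewable energy",      # optional subject
    },
)
print(resp.json())
```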
```diff
@@ -184,12 +187,34 @@ async def crawl_url(request: CrawlRequest):
     try:
         cache_mode = getattr(CacheMode, request.cache_mode)
 
-        #
+        # Configure markdown generator based on whether subject is provided
+        if request.subject:
+            # Use BM25 filter when subject is provided
+            content_filter = BM25ContentFilter(
+                user_query=request.subject,
+                bm25_threshold=1.2,
+                use_stemming=True
+            )
+        else:
+            # Use default pruning filter when no subject
+            content_filter = PruningContentFilter(
+                threshold=0.48,
+                threshold_type="fixed",
+                min_word_threshold=50
+            )
+
+        md_generator = DefaultMarkdownGenerator(
+            content_filter=content_filter,
+            options={"ignore_images": True}
+        )
+
+        # Create crawler with configuration
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
                 cache_mode=cache_mode,
                 excluded_tags=request.excluded_tags,
                 remove_overlay_elements=request.remove_overlay_elements,
+                markdown_generator=md_generator,
                 exclude_external_links=True,
                 exclude_social_media_links=True,
                 exclude_external_images=True,
```
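Pulled out of the handler, the new wiring reduces to: pick a filter, hand it to a DefaultMarkdownGenerator, hand that to CrawlerRunConfig. A self-contained sketch of the subject branch, assuming a crawl4ai version matching the one this Space pins (names and thresholds as in the diff):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter

async def main() -> None:
    # Subject-driven branch of the handler, with the same thresholds.
    md_generator = DefaultMarkdownGenerator(
        content_filter=BM25ContentFilter(
            user_query="renewable energy",  # stand-in for request.subject
            bm25_threshold=1.2,
            use_stemming=True,
        ),
        options={"ignore_images": True},
    )
    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        markdown_generator=md_generator,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        # Filtered markdown is exposed separately from the raw conversion.
        print(result.markdown_v2.fit_markdown)

asyncio.run(main())
```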
```diff
@@ -201,15 +226,16 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )
 
-            #
+            # Extract content
             markdown = result.markdown_v2.raw_markdown
             html = result.html
 
-            # Extract
+            # Extract articles and metadata
             articles = extract_articles(markdown)
             metadata = extract_metadata(markdown, html)
 
-            # Add source URL to
+            # Add source URL and subject to metadata
+            metadata["subject"] = request.subject
             for article in articles:
                 article.source_url = str(request.url)
 
```
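One thing worth checking in this hunk: the handler still reads raw_markdown, which in crawl4ai versions where markdown_v2 is a MarkdownGenerationResult is the unfiltered conversion; the content filter's output lands in fit_markdown instead. If the intent is for extract_articles to see the BM25-filtered or pruned text, the read likely needs to change along these lines (hedged; verify against the pinned crawl4ai version):

```python
# fit_markdown holds the filtered text when a content_filter is attached;
# fall back to raw_markdown in case the filter produced nothing.
markdown = result.markdown_v2.fit_markdown or result.markdown_v2.raw_markdown
```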