NightFury2710 committed
Commit df521e6 · 1 Parent(s): d99bc8b

update api handle 3

Files changed (1): app.py (+30 -4)
app.py CHANGED

@@ -1,6 +1,8 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, HttpUrl
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
 import uvicorn
 import asyncio
 import nest_asyncio
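Note that the `subject` field added in the next hunk is annotated `Optional[str]`, yet no `typing` import is added here. Unless `Optional` is already imported elsewhere in app.py (not visible in this diff), the module will fail at import time; a minimal fix would be one extra line in this import block:

from typing import Optional  # assumed missing; needed by the new `subject: Optional[str]` field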
@@ -23,6 +25,7 @@ class CrawlRequest(BaseModel):
     cache_mode: str = "ENABLED"
     excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
+    subject: Optional[str] = None  # Optional subject for content filtering

 class Article(BaseModel):
     title: str
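With the new field, callers can optionally pass a subject to steer filtering toward a topic; when it is omitted, the handler falls back to structural pruning (see the next hunk). A sketch of a client call, assuming the app runs locally on port 8000 and the route is `/crawl` (neither is visible in this diff):

import requests

# Hypothetical endpoint and port; only the CrawlRequest fields are confirmed by this diff.
resp = requests.post(
    "http://localhost:8000/crawl",
    json={
        "url": "https://example.com/news",
        "cache_mode": "ENABLED",
        "subject": "climate policy",  # omit or set null to use the pruning filter instead
    },
)
print(resp.json())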
@@ -184,12 +187,34 @@ async def crawl_url(request: CrawlRequest):
     try:
         cache_mode = getattr(CacheMode, request.cache_mode)

-        # Create crawler with correct configuration parameters
+        # Configure markdown generator based on whether subject is provided
+        if request.subject:
+            # Use BM25 filter when subject is provided
+            content_filter = BM25ContentFilter(
+                user_query=request.subject,
+                bm25_threshold=1.2,
+                use_stemming=True
+            )
+        else:
+            # Use default pruning filter when no subject
+            content_filter = PruningContentFilter(
+                threshold=0.48,
+                threshold_type="fixed",
+                min_word_threshold=50
+            )
+
+        md_generator = DefaultMarkdownGenerator(
+            content_filter=content_filter,
+            options={"ignore_images": True}
+        )
+
+        # Create crawler with configuration
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
                 cache_mode=cache_mode,
                 excluded_tags=request.excluded_tags,
                 remove_overlay_elements=request.remove_overlay_elements,
+                markdown_generator=md_generator,
                 exclude_external_links=True,
                 exclude_social_media_links=True,
                 exclude_external_images=True,
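The branch above picks a query-driven BM25 relevance filter when a subject is supplied and a structural pruning filter otherwise. The same logic could be factored into a small helper, which keeps crawl_url shorter and makes the fallback easy to unit-test; a sketch reusing the commit's classes and thresholds (the helper name is hypothetical):

from typing import Optional

from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


def build_markdown_generator(subject: Optional[str]) -> DefaultMarkdownGenerator:
    """Hypothetical refactor of the inline branch in crawl_url."""
    if subject:
        # Relevance filtering: keep chunks scoring >= 1.2 against the subject query.
        content_filter = BM25ContentFilter(
            user_query=subject,
            bm25_threshold=1.2,
            use_stemming=True,
        )
    else:
        # Structural pruning: drop low-density blocks below a fixed 0.48 score.
        content_filter = PruningContentFilter(
            threshold=0.48,
            threshold_type="fixed",
            min_word_threshold=50,
        )
    return DefaultMarkdownGenerator(
        content_filter=content_filter,
        options={"ignore_images": True},
    )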
@@ -201,15 +226,16 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )

-            # Use both markdown and HTML results
+            # Extract content
             markdown = result.markdown_v2.raw_markdown
             html = result.html

-            # Extract content
+            # Extract articles and metadata
             articles = extract_articles(markdown)
             metadata = extract_metadata(markdown, html)

-            # Add source URL to articles
+            # Add source URL and subject to metadata
+            metadata["subject"] = request.subject
             for article in articles:
                 article.source_url = str(request.url)
 
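One caveat worth flagging about the changed handler: in crawl4ai, a content filter attached to the markdown generator writes its filtered output to `fit_markdown`, while `raw_markdown` remains the unfiltered conversion. Since the handler still reads `result.markdown_v2.raw_markdown`, the BM25/pruning step may have no effect on what `extract_articles` receives. A sketch of the adjustment, assuming that behaviour of crawl4ai's markdown result object:

# fit_markdown carries the content filter's output; fall back to raw_markdown
# if filtering yields nothing (assumed crawl4ai MarkdownGenerationResult semantics).
markdown = result.markdown_v2.fit_markdown or result.markdown_v2.raw_markdown
html = result.html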