NightFury2710 committed
Commit 32e353a · 1 Parent(s): 9a5fc54

update api handle 3

Files changed (1):
  1. app.py +8 -10
app.py CHANGED

```diff
@@ -22,7 +22,7 @@ app = FastAPI(
 
 class CrawlRequest(BaseModel):
     url: HttpUrl
-    cache_mode: str = "ENABLED"
+    cache_mode: str = "DISABLED"
     excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
     subject: Optional[str] = None  # Optional subject for content filtering
@@ -185,17 +185,17 @@ def extract_metadata(markdown: str, html: str) -> Dict:
 @app.post("/crawl", response_model=CrawlResponse)
 async def crawl_url(request: CrawlRequest):
     try:
-        cache_mode = getattr(CacheMode, request.cache_mode)
+        # Force cache mode to DISABLED
+        cache_mode = CacheMode.DISABLED
 
-        # Configure markdown generator based on whether subject is provided
+        # Configure markdown generator
         if request.subject:
-            # Use BM25 filter when subject is provided use_stemming=True
             content_filter = BM25ContentFilter(
                 user_query=request.subject,
-                bm25_threshold=1.2
+                bm25_threshold=1.2,
+                min_word_threshold=50
             )
         else:
-            # Use default pruning filter when no subject
             content_filter = PruningContentFilter(
                 threshold=0.48,
                 threshold_type="fixed",
@@ -210,7 +210,7 @@ async def crawl_url(request: CrawlRequest):
         # Create crawler with configuration
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
-                cache_mode=cache_mode,
+                cache_mode=cache_mode,  # Always DISABLED
                 excluded_tags=request.excluded_tags,
                 remove_overlay_elements=request.remove_overlay_elements,
                 markdown_generator=md_generator,
@@ -225,15 +225,13 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )
 
-            # Extract content
+            # Process results
             markdown = result.markdown_v2.raw_markdown
             html = result.html
 
-            # Extract articles and metadata
             articles = extract_articles(markdown)
             metadata = extract_metadata(markdown, html)
 
-            # Add source URL and subject to metadata
             metadata["subject"] = request.subject
             for article in articles:
                 article.source_url = str(request.url)
```
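Review note: the handler no longer trusts `request.cache_mode`. The old `getattr(CacheMode, request.cache_mode)` raised `AttributeError` for any string that is not a `CacheMode` member, surfacing as a 500 from the `except` block; the new code pins `CacheMode.DISABLED`, so every request fetches the live page. Below is a minimal sketch of the changed code path outside FastAPI. The filter values mirror this commit; the import paths and the `DefaultMarkdownGenerator` wiring for `md_generator` are assumptions, since both sit outside this diff.

```python
# Minimal sketch of the changed code path, outside FastAPI. Filter values
# mirror this commit; the import paths and DefaultMarkdownGenerator wiring
# are assumptions, since md_generator's construction is not in this diff.
import asyncio
from typing import Optional

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def crawl(url: str, subject: Optional[str] = None) -> str:
    # Cache is always disabled now, regardless of the request payload.
    cache_mode = CacheMode.DISABLED

    if subject:
        # Subject given: rank content blocks against the query with BM25.
        content_filter = BM25ContentFilter(
            user_query=subject,
            bm25_threshold=1.2,
            min_word_threshold=50,  # skip very short blocks (added in this commit)
        )
    else:
        # No subject: fall back to fixed-threshold pruning.
        content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed")

    md_generator = DefaultMarkdownGenerator(content_filter=content_filter)
    config = CrawlerRunConfig(
        cache_mode=cache_mode,
        excluded_tags=["nav", "footer", "aside", "header", "script", "style"],
        remove_overlay_elements=True,
        markdown_generator=md_generator,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, config=config)
    return result.markdown_v2.raw_markdown


if __name__ == "__main__":
    print(asyncio.run(crawl("https://example.com", subject="machine learning")))
```

Pinning the cache mode server-side trades replayability for freshness: repeated requests for the same URL always refetch, which suits a news-style crawler but is slower under load.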
 
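For completeness, a hypothetical smoke test against the updated endpoint. The host, port, and the `CrawlResponse` shape are assumptions; only the request fields come from `CrawlRequest` in the diff.

```python
# Hypothetical smoke test; assumes the app is served locally, e.g. via
# `uvicorn app:app --port 8000`. Request fields come from CrawlRequest above.
import requests

resp = requests.post(
    "http://localhost:8000/crawl",
    json={
        "url": "https://example.com/news",
        # Still accepted in the payload, but the handler now forces
        # CacheMode.DISABLED regardless of what is sent here.
        "cache_mode": "ENABLED",
        "subject": "artificial intelligence",
    },
    timeout=120,
)
resp.raise_for_status()
print(sorted(resp.json()))  # CrawlResponse fields are not part of this diff
```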