Commit 32e353a (parent 9a5fc54): update api handle 3

app.py CHANGED
```diff
@@ -22,7 +22,7 @@ app = FastAPI(
 
 class CrawlRequest(BaseModel):
     url: HttpUrl
-    cache_mode: str = "
+    cache_mode: str = "DISABLED"
     excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
     subject: Optional[str] = None  # Optional subject for content filtering
```
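For reference, a payload for the `/crawl` endpoint matching this model might look like the sketch below. The URL and subject values are made up, and note that the handler (next hunk) forces the cache off regardless of the `cache_mode` field.

```python
# Illustrative /crawl payload matching the CrawlRequest model above;
# the URL and subject are invented example values.
payload = {
    "url": "https://example.com/news",
    "cache_mode": "DISABLED",  # now the default; the handler overrides it anyway
    "excluded_tags": ["nav", "footer", "aside", "header", "script", "style"],
    "remove_overlay_elements": True,
    "subject": "climate policy",  # optional: enables BM25 content filtering
}
```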
```diff
@@ -185,17 +185,17 @@ def extract_metadata(markdown: str, html: str) -> Dict:
 @app.post("/crawl", response_model=CrawlResponse)
 async def crawl_url(request: CrawlRequest):
     try:
-
+        # Force cache mode to DISABLED
+        cache_mode = CacheMode.DISABLED
 
-        # Configure markdown generator
+        # Configure markdown generator
         if request.subject:
-            # Use BM25 filter when subject is provided use_stemming=True
             content_filter = BM25ContentFilter(
                 user_query=request.subject,
-                bm25_threshold=1.2
+                bm25_threshold=1.2,
+                min_word_threshold=50
             )
         else:
-            # Use default pruning filter when no subject
             content_filter = PruningContentFilter(
                 threshold=0.48,
                 threshold_type="fixed",
```
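This hunk selects a `content_filter`, but the construction of the `md_generator` it feeds is outside the visible context. In crawl4ai the filter is typically wrapped in a `DefaultMarkdownGenerator`; a minimal sketch of the presumed wiring, assuming the missing lines follow the library's usual pattern:

```python
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter

# Presumed construction of md_generator (not shown in the diff): the BM25
# filter scores page blocks against the query and drops low-relevance ones
# before markdown conversion.
content_filter = BM25ContentFilter(
    user_query="climate policy",  # stands in for request.subject
    bm25_threshold=1.2,
    min_word_threshold=50,
)
md_generator = DefaultMarkdownGenerator(content_filter=content_filter)
```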
```diff
@@ -210,7 +210,7 @@ async def crawl_url(request: CrawlRequest):
         # Create crawler with configuration
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
-                cache_mode=cache_mode,
+                cache_mode=cache_mode,  # Always DISABLED
                 excluded_tags=request.excluded_tags,
                 remove_overlay_elements=request.remove_overlay_elements,
                 markdown_generator=md_generator,
```
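Since `CacheMode.DISABLED` is set unconditionally, every request re-fetches the page. A self-contained sketch of the crawl step as configured here, using crawl4ai's public API (the example URL is arbitrary):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def main() -> None:
    # Same cache setting as the endpoint: no read or write of cached results.
    config = CrawlerRunConfig(cache_mode=CacheMode.DISABLED)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        print(result.markdown_v2.raw_markdown[:200])

asyncio.run(main())
```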
```diff
@@ -225,15 +225,13 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )
 
-            #
+            # Process results
             markdown = result.markdown_v2.raw_markdown
             html = result.html
 
-            # Extract articles and metadata
            articles = extract_articles(markdown)
            metadata = extract_metadata(markdown, html)
 
-            # Add source URL and subject to metadata
            metadata["subject"] = request.subject
            for article in articles:
                article.source_url = str(request.url)
```
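End to end, a client call might look like the following; the base URL and response handling are assumptions, since `CrawlResponse`'s fields are not part of this diff:

```python
import requests

# Hypothetical client call: only the /crawl path and the request fields
# come from the diff; host, port, and response shape are assumed.
resp = requests.post(
    "http://localhost:8000/crawl",
    json={"url": "https://example.com/news", "subject": "climate policy"},
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```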