# myCrawl4ai/app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
import uvicorn
import asyncio
import nest_asyncio
import re
from typing import Optional, List, Dict
from bs4 import BeautifulSoup
from datetime import datetime

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

app = FastAPI(
    title="Crawl4AI API",
    description="A web API for the Crawl4AI web scraping service",
    version="1.0.0"
)

class CrawlRequest(BaseModel):
    url: HttpUrl
    cache_mode: str = "DISABLED"
    excluded_tags: List[str] = ["nav", "footer", "aside", "header", "script", "style"]
    remove_overlay_elements: bool = True
    ignore_links: bool = True
    subject: Optional[str] = None  # Optional subject for content filtering


class Article(BaseModel):
    title: str
    url: str
    description: Optional[str] = None
    image_url: Optional[str] = None
    timestamp: Optional[str] = None
    category: Optional[str] = None
    source_url: Optional[str] = None  # Tracks the original source page


class CrawlResponse(BaseModel):
    url: str
    success: bool
    error: Optional[str] = None
    metadata: Dict = {}
    articles: List[Article] = []
    raw_markdown: Optional[str] = None
    stats: Dict = {}

def clean_url(url: str) -> str:
    """Clean and normalize URLs"""
    # Remove angle brackets and spaces
    url = url.replace('<', '').replace('>', '').strip()
    # Extract domain from the first https:// occurrence
    if url.startswith('https://'):
        domain = url[8:].split('/')[0]
        # Remove any duplicate domains
        cleaned_url = url.replace(f'https://{domain}/{domain}', domain)
        cleaned_url = cleaned_url.replace(f'https://{domain}/https:/', '')
        cleaned_url = cleaned_url.replace(f'https://{domain}/https://{domain}', domain)
        # Ensure proper https:// prefix
        if not cleaned_url.startswith('https://'):
            cleaned_url = f'https://{cleaned_url}'
    else:
        cleaned_url = url
    # Remove any markdown formatting or extra parameters
    cleaned_url = cleaned_url.split(' ')[0].split(')')[0]
    # Remove any trailing slashes
    cleaned_url = cleaned_url.rstrip('/')
    return cleaned_url
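
# Illustrative example of clean_url on a hypothetical input (not taken from the original code):
#   clean_url("<https://example.com/path/>")  ->  "https://example.com/path"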

def is_valid_title(title: str) -> bool:
    """Check if the title is valid"""
    invalid_patterns = [
        '**_access_time_',
        'existing code',
        '...',
        'navigation',
        'menu',
        'logo'
    ]
    # Check for invalid patterns
    if any(pattern in title.lower() for pattern in invalid_patterns):
        return False
    # Check if it's likely a filename or URL
    if title.count('-') > 3 or title.count('_') > 2:
        return False
    # Check if title is too short
    if len(title.strip()) < 5:
        return False
    return True

def clean_description(description: str) -> Optional[str]:
    """Clean and normalize description text"""
    if not description:
        return None
    # Discard descriptions that contain access_time markers
    if '_access_time_' in description:
        return None
    # Remove markdown links, keeping the link text
    description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
    # Remove URLs
    description = re.sub(r'https?://\S+', '', description)
    # Remove special characters and extra whitespace
    description = description.replace('(', '').replace(')', '').replace('<', '').replace('>', '')
    description = ' '.join(description.split())
    return description if len(description) > 10 else None

def extract_articles(markdown: str) -> List[Article]:
    """Extract article entries (title, URL, description, image) from crawled markdown."""
    articles = []
    seen_urls = set()  # Track unique URLs
    # Pattern: optional image prefix, linked title, optional extras, trailing description text
    article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
    matches = re.finditer(article_pattern, markdown, re.DOTALL)
    for match in matches:
        title = match.group(2)        # Article title
        url = match.group(3)          # Article URL
        description = match.group(6)  # Description text
        # Skip if title is invalid
        if not is_valid_title(title):
            continue
        # Clean and validate URL
        url = clean_url(url)
        # Skip if URL already processed or is an image
        if url in seen_urls or url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
            continue
        seen_urls.add(url)
        # Clean description
        clean_desc = clean_description(description)
        # Extract image URL if present
        image_url = None
        image_match = re.search(r'!\[([^\]]*)\]\(([^)]+)\)', description) if description else None
        if image_match:
            image_url = clean_url(image_match.group(2))
        article = Article(
            title=title.strip(),
            url=url,
            description=clean_desc,
            image_url=image_url,
            timestamp=None,
            category=None,
            source_url=None  # Set by the caller
        )
        articles.append(article)
    return articles
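
# Illustrative markdown line the article pattern above targets (assumed shape of the crawler's output):
#   [Article headline here](https://example.com/news/article) Short teaser text for the story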

def extract_metadata(markdown: str, html: str) -> Dict:
    """Build crawl metadata (timestamp, categories) from the markdown; html is currently unused."""
    metadata = {
        "timestamp": datetime.now().isoformat(),
        "categories": [],
        "total_articles": 0
    }
    # Extract categories from linked markdown headings
    category_pattern = r'##\s+\[(.*?)\]'
    categories = re.findall(category_pattern, markdown)
    if categories:
        metadata["categories"] = [cat.strip() for cat in categories]
    return metadata
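
# Illustrative heading the category pattern above matches (hypothetical example):
#   "## [World News](https://example.com/world)"  ->  category "World News"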

@app.post("/crawl", response_model=CrawlResponse)
async def crawl_url(request: CrawlRequest):
    try:
        # Caching is always disabled, regardless of the requested cache_mode
        cache_mode = CacheMode.DISABLED
        # Configure the content filter: BM25 when a subject is given, pruning otherwise
        if request.subject:
            content_filter = BM25ContentFilter(
                user_query=request.subject,
                bm25_threshold=1.2
            )
        else:
            content_filter = PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=50
            )
        # Markdown generator options: always drop images, drop links if requested
        options = {"ignore_images": True}
        if request.ignore_links:
            options["ignore_links"] = True
        md_generator = DefaultMarkdownGenerator(
            content_filter=content_filter,
            options=options
        )
        # Create crawler with configuration
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                cache_mode=cache_mode,  # Always DISABLED
                excluded_tags=request.excluded_tags,
                remove_overlay_elements=request.remove_overlay_elements,
                markdown_generator=md_generator,
                exclude_external_links=True,
                exclude_social_media_links=True,
                exclude_external_images=True,
                exclude_domains=["facebook.com", "twitter.com", "instagram.com", "youtube.com", "tiktok.com", "pinterest.com"]
            )
            result = await crawler.arun(
                url=str(request.url),
                config=config
            )
            # Process results
            markdown = result.markdown_v2.raw_markdown
            html = result.html
            articles = extract_articles(markdown)
            metadata = extract_metadata(markdown, html)
            metadata["subject"] = request.subject
            for article in articles:
                article.source_url = str(request.url)
            return CrawlResponse(
                url=str(request.url),
                success=result.success,
                metadata=metadata,
                articles=articles,
                raw_markdown=markdown if result.success else None,
                stats={
                    "total_links": len(result.links) if result.links else 0,
                    "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
                }
            )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/")
def read_root():
    return {
        "message": "Welcome to Crawl4AI API",
        "docs": "/docs",
        "redoc": "/redoc"
    }


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
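
# Example request against the running service (a sketch; adjust host/port to your deployment):
#   curl -X POST http://localhost:7860/crawl \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com/news", "subject": "technology"}'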