from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
import uvicorn
import asyncio
import nest_asyncio
import re
from typing import Optional, List, Dict
from bs4 import BeautifulSoup
from datetime import datetime
# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

app = FastAPI(
    title="Crawl4AI API",
    description="A web API for Crawl4AI web scraping service",
    version="1.0.0"
)

class CrawlRequest(BaseModel):
    url: HttpUrl
    cache_mode: str = "DISABLED"
    excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
    remove_overlay_elements: bool = True
    ignore_links: bool = True
    subject: Optional[str] = None  # Optional subject for content filtering

class Article(BaseModel):
    title: str
    url: str
    description: Optional[str] = None
    image_url: Optional[str] = None
    timestamp: Optional[str] = None
    category: Optional[str] = None
    source_url: Optional[str] = None  # Added to track original source

class CrawlResponse(BaseModel):
    url: str
    success: bool
    error: Optional[str] = None
    metadata: Dict = {}
    articles: List[Article] = []
    raw_markdown: Optional[str] = None
    stats: Dict = {}

def clean_url(url: str) -> str:
    """Clean and normalize URLs"""
    # Remove angle brackets and spaces
    url = url.replace('<', '').replace('>', '').strip()
    # Extract domain from the first https:// occurrence
    if url.startswith('https://'):
        domain = url[8:].split('/')[0]
        # Remove any duplicate domains
        cleaned_url = url.replace(f'https://{domain}/{domain}', domain)
        cleaned_url = cleaned_url.replace(f'https://{domain}/https:/', '')
        cleaned_url = cleaned_url.replace(f'https://{domain}/https://{domain}', domain)
        # Ensure proper https:// prefix
        if not cleaned_url.startswith('https://'):
            cleaned_url = f'https://{cleaned_url}'
    else:
        cleaned_url = url
    # Remove any markdown formatting or extra parameters
    cleaned_url = cleaned_url.split(' ')[0].split(')')[0]
    # Remove any trailing slashes
    cleaned_url = cleaned_url.rstrip('/')
    return cleaned_url

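# Illustrative example (hypothetical URL) of how clean_url handles a duplicated domain:
#   clean_url('https://example.com/example.com/article/') -> 'https://example.com/article'
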
def is_valid_title(title: str) -> bool:
    """Check if the title is valid"""
    invalid_patterns = [
        '**_access_time_',
        'existing code',
        '...',
        'navigation',
        'menu',
        'logo'
    ]
    # Check for invalid patterns
    if any(pattern in title.lower() for pattern in invalid_patterns):
        return False
    # Check if it's likely a filename or URL
    if title.count('-') > 3 or title.count('_') > 2:
        return False
    # Check if title is too short
    if len(title.strip()) < 5:
        return False
    return True

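# Quick sanity examples (hypothetical titles):
#   is_valid_title("Menu") -> False  (matches an invalid pattern and is too short)
#   is_valid_title("City council approves new budget") -> True
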
def clean_description(description: str) -> Optional[str]:
    """Clean and normalize description text"""
    if not description:
        return None
    # Remove access_time markers
    if '_access_time_' in description:
        return None
    # Remove markdown links
    description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
    # Remove URLs
    description = re.sub(r'https?://\S+', '', description)
    # Remove special characters and extra whitespace
    description = description.replace('(', '').replace(')', '').replace('<', '').replace('>', '')
    description = ' '.join(description.split())
    return description if len(description) > 10 else None

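# Illustrative example (hypothetical text): markdown links are reduced to their link text
# and bare URLs are stripped before the cleaned description is returned:
#   clean_description('[Read more](https://news.example.com/a) about the new park downtown')
#   -> 'Read more about the new park downtown'
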
def extract_articles(markdown: str) -> List[Article]:
    articles = []
    seen_urls = set()  # Track unique URLs
    # Updated regex pattern:
    # group 1 = optional image alt text, group 2 = link title, group 3 = link URL,
    # groups 4-5 = optional trailing parenthesised/bracketed fragments, group 6 = description text
    article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
    matches = re.finditer(article_pattern, markdown, re.DOTALL)
    for match in matches:
        title = match.group(2)  # Article title
        url = match.group(3)  # Article URL
        description = match.group(6)  # Description text
        # Skip if title is invalid
        if not is_valid_title(title):
            continue
        # Clean and validate URL
        url = clean_url(url)
        # Skip if URL already processed or is an image
        if url in seen_urls or url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
            continue
        seen_urls.add(url)
        # Clean description
        clean_desc = clean_description(description)
        # Extract image URL if present
        image_url = None
        image_match = re.search(r'!\[([^\]]*)\]\(([^)]+)\)', description) if description else None
        if image_match:
            image_url = clean_url(image_match.group(2))
        article = Article(
            title=title.strip(),
            url=url,
            description=clean_desc,
            image_url=image_url,
            timestamp=None,
            category=None,
            source_url=None  # Will be set later
        )
        articles.append(article)
    return articles

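# Illustrative example (hypothetical markdown) of the link shape this parser expects:
#   [Mayor announces new park plan](https://example.com/example.com/news/park-plan) The city will open a new park downtown.
# yields one Article with the URL normalised to https://example.com/news/park-plan
# and the trailing sentence used as the description.
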
def extract_metadata(markdown: str, html: str) -> Dict:
    metadata = {
        "timestamp": datetime.now().isoformat(),
        "categories": [],
        "total_articles": 0
    }
    # Extract categories
    category_pattern = r'##\s+\[(.*?)\]'
    categories = re.findall(category_pattern, markdown)
    if categories:
        metadata["categories"] = [cat.strip() for cat in categories]
    return metadata

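# Illustrative example (hypothetical heading): a markdown line such as "## [World](https://example.com/world)"
# contributes "World" to metadata["categories"].
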
@app.post("/crawl", response_model=CrawlResponse)
async def crawl_url(request: CrawlRequest):
try:
# Force cache mode to DISABLED
cache_mode = CacheMode.DISABLED
# Configure markdown generator
if request.subject:
content_filter = BM25ContentFilter(
user_query=request.subject,
bm25_threshold=1.2
)
else:
content_filter = PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=50
)
# Create options dictionary with ignore_images
options = {"ignore_images": True}
# Add ignore_links if requested
if request.ignore_links:
options["ignore_links"] = True
md_generator = DefaultMarkdownGenerator(
content_filter=content_filter,
options=options
)
# Create crawler with configuration
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
cache_mode=cache_mode, # Always DISABLED
excluded_tags=request.excluded_tags,
remove_overlay_elements=request.remove_overlay_elements,
markdown_generator=md_generator,
exclude_external_links=True,
exclude_social_media_links=True,
exclude_external_images=True,
exclude_domains=["facebook.com", "twitter.com", "instagram.com", "youtube.com", "tiktok.com", "pinterest.com"]
)
result = await crawler.arun(
url=str(request.url),
config=config
)
# Process results
markdown = result.markdown_v2.raw_markdown
html = result.html
articles = extract_articles(markdown)
metadata = extract_metadata(markdown, html)
metadata["subject"] = request.subject
for article in articles:
article.source_url = str(request.url)
return CrawlResponse(
url=str(request.url),
success=result.success,
metadata=metadata,
articles=articles,
raw_markdown=markdown if result.success else None,
                stats={
                    # result.links is grouped by "internal"/"external", so count the entries in each group
                    "total_links": sum(len(v) for v in result.links.values()) if result.links else 0,
                    "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
                }
            )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/")
def read_root():
return {
"message": "Welcome to Crawl4AI API",
"docs": "/docs",
"redoc": "/redoc"
}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=7860) |
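
# Example request (illustrative; assumes the server is running locally on port 7860):
#   curl -X POST http://localhost:7860/crawl \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com/news", "subject": "technology"}'
# The response includes the extracted articles, page metadata, the raw markdown, and basic stats.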