from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
import uvicorn
import asyncio
import nest_asyncio
import re
from typing import Optional, List, Dict
from bs4 import BeautifulSoup
from datetime import datetime
# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

app = FastAPI(
    title="Crawl4AI API",
    description="A web API for Crawl4AI web scraping service",
    version="1.0.0"
)

class CrawlRequest(BaseModel):
    url: HttpUrl
    cache_mode: str = "DISABLED"
    excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
    remove_overlay_elements: bool = True
    ignore_links: bool = True
    subject: Optional[str] = None  # Optional subject for content filtering

class Article(BaseModel):
    title: str
    url: str
    description: Optional[str] = None
    image_url: Optional[str] = None
    timestamp: Optional[str] = None
    category: Optional[str] = None
    source_url: Optional[str] = None  # Added to track original source

class CrawlResponse(BaseModel):
    url: str
    success: bool
    error: Optional[str] = None
    metadata: Dict = {}
    articles: List[Article] = []
    raw_markdown: Optional[str] = None
    stats: Dict = {}

def clean_url(url: str) -> str:
    """Clean and normalize URLs"""
    # Remove angle brackets and spaces
    url = url.replace('<', '').replace('>', '').strip()
    # Extract domain from the first https:// occurrence
    if url.startswith('https://'):
        domain = url[8:].split('/')[0]
        # Remove any duplicate domains
        cleaned_url = url.replace(f'https://{domain}/{domain}', domain)
        cleaned_url = cleaned_url.replace(f'https://{domain}/https:/', '')
        cleaned_url = cleaned_url.replace(f'https://{domain}/https://{domain}', domain)
        # Ensure proper https:// prefix
        if not cleaned_url.startswith('https://'):
            cleaned_url = f'https://{cleaned_url}'
    else:
        cleaned_url = url
    # Remove any markdown formatting or extra parameters
    cleaned_url = cleaned_url.split(' ')[0].split(')')[0]
    # Remove any trailing slashes
    cleaned_url = cleaned_url.rstrip('/')
    return cleaned_url

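# Illustrative example (hypothetical URL) of how clean_url handles a duplicated domain:
#   clean_url('https://example.com/example.com/article/') -> 'https://example.com/article'
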
def is_valid_title(title: str) -> bool:
    """Check if the title is valid"""
    invalid_patterns = [
        '**_access_time_',
        'existing code',
        '...',
        'navigation',
        'menu',
        'logo'
    ]
    # Check for invalid patterns
    if any(pattern in title.lower() for pattern in invalid_patterns):
        return False
    # Check if it's likely a filename or URL
    if title.count('-') > 3 or title.count('_') > 2:
        return False
    # Check if title is too short
    if len(title.strip()) < 5:
        return False
    return True

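# Quick sanity examples (hypothetical titles):
#   is_valid_title("Menu") -> False  (matches an invalid pattern and is too short)
#   is_valid_title("City council approves new budget") -> True
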
def clean_description(description: str) -> Optional[str]:
    """Clean and normalize description text"""
    if not description:
        return None
    # Remove access_time markers
    if '_access_time_' in description:
        return None
    # Remove markdown links
    description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
    # Remove URLs
    description = re.sub(r'https?://\S+', '', description)
    # Remove special characters and extra whitespace
    description = description.replace('(', '').replace(')', '').replace('<', '').replace('>', '')
    description = ' '.join(description.split())
    return description if len(description) > 10 else None

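# Illustrative example (hypothetical text): markdown links are reduced to their link text
# and bare URLs are stripped before the cleaned description is returned:
#   clean_description('[Read more](https://news.example.com/a) about the new park downtown')
#   -> 'Read more about the new park downtown'
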
def extract_articles(markdown: str) -> List[Article]:
    articles = []
    seen_urls = set()  # Track unique URLs
    # Updated regex pattern:
    # group 1 = optional image alt text, group 2 = link title, group 3 = link URL,
    # groups 4-5 = optional trailing parenthesised/bracketed fragments, group 6 = description text
    article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
    matches = re.finditer(article_pattern, markdown, re.DOTALL)
    for match in matches:
        title = match.group(2)  # Article title
        url = match.group(3)  # Article URL
        description = match.group(6)  # Description text
        # Skip if title is invalid
        if not is_valid_title(title):
            continue
        # Clean and validate URL
        url = clean_url(url)
        # Skip if URL already processed or is an image
        if url in seen_urls or url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
            continue
        seen_urls.add(url)
        # Clean description
        clean_desc = clean_description(description)
        # Extract image URL if present
        image_url = None
        image_match = re.search(r'!\[([^\]]*)\]\(([^)]+)\)', description) if description else None
        if image_match:
            image_url = clean_url(image_match.group(2))
        article = Article(
            title=title.strip(),
            url=url,
            description=clean_desc,
            image_url=image_url,
            timestamp=None,
            category=None,
            source_url=None  # Will be set later
        )
        articles.append(article)
    return articles

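# Illustrative example (hypothetical markdown) of the link shape this parser expects:
#   [Mayor announces new park plan](https://example.com/example.com/news/park-plan) The city will open a new park downtown.
# yields one Article with the URL normalised to https://example.com/news/park-plan
# and the trailing sentence used as the description.
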
def extract_metadata(markdown: str, html: str) -> Dict:
    metadata = {
        "timestamp": datetime.now().isoformat(),
        "categories": [],
        "total_articles": 0
    }
    # Extract categories
    category_pattern = r'##\s+\[(.*?)\]'
    categories = re.findall(category_pattern, markdown)
    if categories:
        metadata["categories"] = [cat.strip() for cat in categories]
    return metadata

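# Illustrative example (hypothetical heading): a markdown line such as "## [World](https://example.com/world)"
# contributes "World" to metadata["categories"].
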
@app.post("/crawl", response_model=CrawlResponse)
async def crawl_url(request: CrawlRequest):
try:
# Force cache mode to DISABLED
cache_mode = CacheMode.DISABLED
# Configure markdown generator
if request.subject:
content_filter = BM25ContentFilter(
user_query=request.subject,
bm25_threshold=1.2
)
else:
content_filter = PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=50
)
# Create options dictionary with ignore_images
options = {"ignore_images": True}
# Add ignore_links if requested
if request.ignore_links:
options["ignore_links"] = True
md_generator = DefaultMarkdownGenerator(
content_filter=content_filter,
options=options
)
# Create crawler with configuration
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
cache_mode=cache_mode, # Always DISABLED
excluded_tags=request.excluded_tags,
remove_overlay_elements=request.remove_overlay_elements,
markdown_generator=md_generator,
exclude_external_links=True,
exclude_social_media_links=True,
exclude_external_images=True,
exclude_domains=["facebook.com", "twitter.com", "instagram.com", "youtube.com", "tiktok.com", "pinterest.com"]
)
result = await crawler.arun(
url=str(request.url),
config=config
)
# Process results
markdown = result.markdown_v2.raw_markdown
html = result.html
articles = extract_articles(markdown)
metadata = extract_metadata(markdown, html)
metadata["subject"] = request.subject
for article in articles:
article.source_url = str(request.url)
return CrawlResponse(
url=str(request.url),
success=result.success,
metadata=metadata,
articles=articles,
raw_markdown=markdown if result.success else None,
                stats={
                    # result.links is grouped by "internal"/"external", so count the entries in each group
                    "total_links": sum(len(v) for v in result.links.values()) if result.links else 0,
                    "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
                }
            )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/")
def read_root():
return {
"message": "Welcome to Crawl4AI API",
"docs": "/docs",
"redoc": "/redoc"
}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=7860) |
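
# Example request (illustrative; assumes the server is running locally on port 7860):
#   curl -X POST http://localhost:7860/crawl \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com/news", "subject": "technology"}'
# The response includes the extracted articles, page metadata, the raw markdown, and basic stats.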