from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
import uvicorn
import nest_asyncio
import re
from typing import Optional, List, Dict
from datetime import datetime

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

app = FastAPI(
    title="Crawl4AI API",
    description="A web API for Crawl4AI web scraping service",
    version="1.0.0"
)

class CrawlRequest(BaseModel):
    url: HttpUrl
    cache_mode: str = "DISABLED"  # Currently ignored: the /crawl endpoint always forces CacheMode.DISABLED
    excluded_tags: List[str] = ["nav", "footer", "aside", "header", "script", "style"]
    remove_overlay_elements: bool = True
    ignore_links: bool = True
    subject: Optional[str] = None  # Optional subject for content filtering

class Article(BaseModel):
    title: str
    url: str
    description: Optional[str] = None
    image_url: Optional[str] = None
    timestamp: Optional[str] = None
    category: Optional[str] = None
    source_url: Optional[str] = None  # Added to track original source

class CrawlResponse(BaseModel):
    url: str
    success: bool
    error: Optional[str] = None
    metadata: Dict = {}
    articles: List[Article] = []
    raw_markdown: Optional[str] = None
    stats: Dict = {}

def clean_url(url: str) -> str:
    """Clean and normalize URLs"""
    # Remove angle brackets and spaces
    url = url.replace('<', '').replace('>', '').strip()
    
    # Extract domain from the first https:// occurrence
    if url.startswith('https://'):
        domain = url[8:].split('/')[0]
        
        # Remove duplicated domain/scheme fragments, applying the most specific
        # pattern first so the broader replacements below do not mask it
        cleaned_url = url.replace(f'https://{domain}/https://{domain}', f'https://{domain}')
        cleaned_url = cleaned_url.replace(f'https://{domain}/{domain}', f'https://{domain}')
        cleaned_url = cleaned_url.replace(f'https://{domain}/https:/', 'https://')
        
        # Ensure proper https:// prefix
        if not cleaned_url.startswith('https://'):
            cleaned_url = f'https://{cleaned_url}'
    else:
        cleaned_url = url
    
    # Remove any markdown formatting or extra parameters
    cleaned_url = cleaned_url.split(' ')[0].split(')')[0]
    
    # Remove any trailing slashes
    cleaned_url = cleaned_url.rstrip('/')
    
    return cleaned_url

def is_valid_title(title: str) -> bool:
    """Check if the title is valid"""
    invalid_patterns = [
        '**_access_time_',
        'existing code',
        '...',
        'navigation',
        'menu',
        'logo'
    ]
    
    # Check for invalid patterns
    if any(pattern in title.lower() for pattern in invalid_patterns):
        return False
        
    # Check if it's likely a filename or URL
    if title.count('-') > 3 or title.count('_') > 2:
        return False
        
    # Check if title is too short
    if len(title.strip()) < 5:
        return False
        
    return True

def clean_description(description: str) -> Optional[str]:
    """Clean and normalize description text"""
    if not description:
        return None
        
    # Remove access_time markers
    if '_access_time_' in description:
        return None
        
    # Remove markdown links
    description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
    
    # Remove URLs
    description = re.sub(r'https?://\S+', '', description)
    
    # Remove special characters and extra whitespace
    description = description.replace('(', '').replace(')', '').replace('<', '').replace('>', '')
    description = ' '.join(description.split())
    
    return description if len(description) > 10 else None

def extract_articles(markdown: str) -> List[Article]:
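    """Parse markdown link entries into Article objects, de-duplicating by URL
    and skipping image links and entries with invalid titles."""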
    articles = []
    seen_urls = set()  # Track unique URLs
    
    # Markdown link pattern: optional image prefix (group 1), link text (group 2),
    # link URL (group 3), optional trailing parenthetical/bracket (groups 4-5),
    # and any following plain text on the line as the description (group 6)
    article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
    matches = re.finditer(article_pattern, markdown, re.DOTALL)
    
    for match in matches:
        title = match.group(2)        # Article title
        url = match.group(3)          # Article URL
        description = match.group(6)   # Description text
        
        # Skip if title is invalid
        if not is_valid_title(title):
            continue
            
        # Clean and validate URL
        url = clean_url(url)
        
        # Skip if URL already processed or is an image
        if url in seen_urls or url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
            continue
            
        seen_urls.add(url)
        
        # Clean description
        clean_desc = clean_description(description)
        
        # Extract image URL if present
        image_url = None
        image_match = re.search(r'!\[([^\]]*)\]\(([^)]+)\)', description) if description else None
        if image_match:
            image_url = clean_url(image_match.group(2))
        
        article = Article(
            title=title.strip(),
            url=url,
            description=clean_desc,
            image_url=image_url,
            timestamp=None,
            category=None,
            source_url=None  # Will be set later
        )
        articles.append(article)
    
    return articles

def extract_metadata(markdown: str, html: str) -> Dict:
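    """Build response metadata: a timestamp plus any '## [...]' category
    headings found in the markdown (the html argument is currently unused)."""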
    metadata = {
        "timestamp": datetime.now().isoformat(),
        "categories": [],
        "total_articles": 0
    }
    
    # Extract categories
    category_pattern = r'##\s+\[(.*?)\]'
    categories = re.findall(category_pattern, markdown)
    if categories:
        metadata["categories"] = [cat.strip() for cat in categories]
    
    return metadata

@app.post("/crawl", response_model=CrawlResponse)
async def crawl_url(request: CrawlRequest):
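    """Crawl the requested URL with Crawl4AI, filter the content (BM25 when a
    subject is supplied, pruning otherwise), and return extracted articles,
    metadata, and the raw markdown."""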
    try:
        # Force cache mode to DISABLED
        cache_mode = CacheMode.DISABLED
        
        # Configure markdown generator
        if request.subject:
            content_filter = BM25ContentFilter(
                user_query=request.subject,
                bm25_threshold=1.2
            )
        else:
            content_filter = PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=50
            )
            
        # Create options dictionary with ignore_images
        options = {"ignore_images": True}
        
        # Add ignore_links if requested
        if request.ignore_links:
            options["ignore_links"] = True
            
        md_generator = DefaultMarkdownGenerator(
            content_filter=content_filter,
            options=options
        )
        
        # Create crawler with configuration
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                cache_mode=cache_mode,  # Always DISABLED
                excluded_tags=request.excluded_tags,
                remove_overlay_elements=request.remove_overlay_elements,
                markdown_generator=md_generator,
                exclude_external_links=True,
                exclude_social_media_links=True,
                exclude_external_images=True,
                exclude_domains=["facebook.com", "twitter.com", "instagram.com", "youtube.com", "tiktok.com", "pinterest.com"]
            )
            
            result = await crawler.arun(
                url=str(request.url),
                config=config
            )
            
            # Process results
            markdown = result.markdown_v2.raw_markdown
            html = result.html
            
            articles = extract_articles(markdown)
            metadata = extract_metadata(markdown, html)
            
            metadata["subject"] = request.subject
            for article in articles:
                article.source_url = str(request.url)
            
            return CrawlResponse(
                url=str(request.url),
                success=result.success,
                metadata=metadata,
                articles=articles,
                raw_markdown=markdown if result.success else None,
                stats={
                    "total_links": len(result.links) if result.links else 0,
                    "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
                }
            )
            
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/")
def read_root():
    return {
        "message": "Welcome to Crawl4AI API",
        "docs": "/docs",
        "redoc": "/redoc"
    }

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
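
# ---------------------------------------------------------------------------
# Example request (a minimal sketch, assuming the server is running locally on
# the port configured above; the JSON fields mirror the CrawlRequest model,
# and "https://example.com" / "technology" are placeholder values):
#
#   curl -X POST http://localhost:7860/crawl \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com", "subject": "technology"}'
#
# Interactive API documentation is available at /docs and /redoc.
# ---------------------------------------------------------------------------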