Commit 6c4f9d7 · parent 8ecfdd5 · update api handle 3

app.py CHANGED
@@ -21,8 +21,10 @@ app = FastAPI(
 class CrawlRequest(BaseModel):
     url: HttpUrl
     cache_mode: str = "ENABLED"
-    excluded_tags: list[str] = ["nav", "footer", "aside"]
+    excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
+    max_pages: int = 1  # Limit number of pages to crawl
+    timeout: int = 30  # Timeout in seconds

 class Article(BaseModel):
     title: str
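For reference, the request body accepted by the updated CrawlRequest model could look like the sketch below. This is not part of the commit: it assumes it is run next to app.py so the model can be imported, and the values are only illustrative (cache_mode must name a CacheMode member, since the endpoint resolves it with getattr).

# Illustrative payload for the updated CrawlRequest model (values are examples only).
from app import CrawlRequest  # assumes this script sits next to app.py

payload = {
    "url": "https://example.com/news",   # hypothetical target page
    "cache_mode": "ENABLED",             # must match a CacheMode member name
    "excluded_tags": ["nav", "footer", "aside", "header", "script", "style"],
    "remove_overlay_elements": True,
    "max_pages": 1,                      # new field
    "timeout": 30,                       # new field
}

req = CrawlRequest(**payload)            # pydantic validates the URL and field types
print(req.max_pages, req.timeout)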
@@ -31,6 +33,7 @@ class Article(BaseModel):
     image_url: Optional[str] = None
     timestamp: Optional[str] = None
     category: Optional[str] = None
+    source_url: Optional[str] = None  # Added to track original source

 class CrawlResponse(BaseModel):
     url: str
@@ -39,58 +42,114 @@ class CrawlResponse(BaseModel):
     metadata: Dict = {}
     articles: List[Article] = []
     raw_markdown: Optional[str] = None
+    stats: Dict = {}
+
+def clean_url(url: str) -> str:
+    """Clean and normalize URLs"""
+    # Remove angle brackets and extra domains
+    url = url.replace('<', '').replace('>', '')
+
+    # Fix double domain issues
+    if 'https://' in url[8:]:  # Check after first https://
+        url = url.replace('https://', '', 1)  # Remove first occurrence
+
+    # Remove any markdown or text formatting
+    url = url.split(' ')[0].split(')')[0]
+
+    return url
+
+def is_valid_title(title: str) -> bool:
+    """Check if the title is valid"""
+    invalid_patterns = [
+        '**_access_time_',
+        'existing code',
+        '...',
+        'navigation',
+        'menu',
+        'logo'
+    ]
+
+    # Check for invalid patterns
+    if any(pattern in title.lower() for pattern in invalid_patterns):
+        return False
+
+    # Check if it's likely a filename or URL
+    if title.count('-') > 3 or title.count('_') > 2:
+        return False
+
+    # Check if title is too short
+    if len(title.strip()) < 5:
+        return False
+
+    return True
+
+def clean_description(description: str) -> Optional[str]:
+    """Clean and normalize description text"""
+    if not description:
+        return None
+
+    # Remove markdown links
+    description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
+
+    # Remove URLs
+    description = re.sub(r'https?://\S+', '', description)
+
+    # Remove special characters and extra whitespace
+    description = description.replace('(', '').replace(')', '').replace('<', '').replace('>', '')
+    description = ' '.join(description.split())
+
+    return description if len(description) > 10 else None

 def extract_articles(markdown: str) -> List[Article]:
     articles = []

-    # Updated regex pattern
+    # Updated regex pattern
     article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
     matches = re.finditer(article_pattern, markdown, re.DOTALL)

+    seen_urls = set()  # Track unique URLs
+
     for match in matches:
-        # Extract components
-        image_title = match.group(1)  # Image alt text if exists
         title = match.group(2)  # Article title
         url = match.group(3)  # Article URL
         description = match.group(6)  # Description text

-        #
-
+        # Skip if title is invalid
+        if not is_valid_title(title):
+            continue
+
+        # Clean and validate URL
+        url = clean_url(url)

-        # Skip
-        if
+        # Skip if URL already processed or is an image
+        if url in seen_urls or url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
             continue

-
+        seen_urls.add(url)
+
+        # Clean description
+        clean_desc = clean_description(description)
+
+        # Extract image URL if present
         image_url = None
         image_match = re.search(r'!\[([^\]]*)\]\(([^)]+)\)', description) if description else None
         if image_match:
-            image_url = image_match.group(2)
-            description = description.replace(image_match.group(0), '').strip()
-
-        # Clean up description
-        if description:
-            # Remove markdown links from description
-            description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
-            # Remove quotes
-            description = description.replace('"', '').strip()
-            # Remove multiple spaces
-            description = ' '.join(description.split())

-
-
-
-
-
-
-
-
-
-
+        article = Article(
+            title=title.strip(),
+            url=url,
+            description=clean_desc,
+            image_url=image_url,
+            timestamp=None,
+            category=None,
+            source_url=None
+        )
+        articles.append(article)

     return articles

-def extract_metadata(markdown: str) -> Dict:
+def extract_metadata(markdown: str, html: str) -> Dict:
     metadata = {
         "timestamp": datetime.now().isoformat(),
         "categories": [],
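A quick, informal check of the three new helpers, not part of the commit. It assumes app.py is importable from the current directory; the expected results in the comments follow directly from the function bodies above.

# Ad-hoc sanity checks for clean_url, is_valid_title and clean_description.
from app import clean_url, is_valid_title, clean_description

print(clean_url("<https://https://example.com/news/item-1>"))
# -> https://example.com/news/item-1  (angle brackets stripped, duplicated scheme removed)

print(is_valid_title("Menu"))                              # -> False (matches an invalid pattern)
print(is_valid_title("Markets rally on rate cut hopes"))   # -> True

print(clean_description("Read more at [the site](https://example.com) today"))
# -> "Read more at the site today"  (markdown link unwrapped, whitespace normalized)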
@@ -108,14 +167,21 @@ def extract_metadata(markdown: str) -> Dict:
 @app.post("/crawl", response_model=CrawlResponse)
 async def crawl_url(request: CrawlRequest):
     try:
-        # Convert cache_mode string to enum
         cache_mode = getattr(CacheMode, request.cache_mode)

+        # Create crawler with improved configuration
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
                 cache_mode=cache_mode,
                 excluded_tags=request.excluded_tags,
-                remove_overlay_elements=request.remove_overlay_elements
+                remove_overlay_elements=request.remove_overlay_elements,
+                max_pages=request.max_pages,
+                timeout=request.timeout,
+                # Added from quickstart examples
+                remove_ads=True,
+                extract_text=True,
+                extract_links=True,
+                extract_images=True
             )

             result = await crawler.arun(
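The extra CrawlerRunConfig arguments introduced here (max_pages, timeout, remove_ads, extract_text, extract_links, extract_images) are taken from the quickstart examples per the comment, but not every crawl4ai release necessarily accepts all of them. Below is a defensive sketch, not part of this commit, that falls back to the minimal config if the installed constructor rejects an unknown keyword with a TypeError.

from crawl4ai import CacheMode, CrawlerRunConfig

def build_config(request, cache_mode: CacheMode) -> CrawlerRunConfig:
    """Build the run config, dropping options the installed crawl4ai may not accept."""
    base = dict(
        cache_mode=cache_mode,
        excluded_tags=request.excluded_tags,
        remove_overlay_elements=request.remove_overlay_elements,
    )
    extras = dict(
        max_pages=request.max_pages,
        timeout=request.timeout,
        remove_ads=True,
        extract_text=True,
        extract_links=True,
        extract_images=True,
    )
    try:
        return CrawlerRunConfig(**base, **extras)
    except TypeError:
        # Installed version does not expose these keyword arguments; keep the core options.
        return CrawlerRunConfig(**base)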
@@ -123,18 +189,29 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )

-            #
+            # Use both markdown and HTML results for better extraction
             markdown = result.markdown_v2.raw_markdown
+            html = result.html
+
+            # Extract content using both markdown and HTML
             articles = extract_articles(markdown)
-            metadata = extract_metadata(markdown)
-
+            metadata = extract_metadata(markdown, html)
+
+            # Add source URL to articles
+            for article in articles:
+                article.source_url = str(request.url)

             return CrawlResponse(
                 url=str(request.url),
                 success=result.success,
                 metadata=metadata,
                 articles=articles,
-                raw_markdown=markdown if result.success else None
+                raw_markdown=markdown if result.success else None,
+                stats={
+                    "total_links": len(result.links) if result.links else 0,
+                    "total_images": len(result.images) if result.images else 0,
+                    "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
+                }
             )

     except Exception as e:
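A minimal round-trip sketch against a locally running instance, showing the new stats block and the per-article source_url. The host, port and target URL are assumptions (e.g. serving with uvicorn app:app --port 8000), not part of the commit.

# Post a crawl request and read the fields added in this commit.
import requests

resp = requests.post(
    "http://localhost:8000/crawl",
    json={"url": "https://example.com/news", "cache_mode": "ENABLED"},
    timeout=60,
)
data = resp.json()
print(data["stats"])        # total_links, total_images, processing_time
for article in data["articles"]:
    print(article["title"], article["source_url"])  # source_url now carries the crawled URL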