NightFury2710 committed on
Commit c0e3878 · 1 Parent(s): f300b39

update api handle 3

Files changed (2)
  1. app.py +77 -4
  2. requirements.txt +8 -6
app.py CHANGED
@@ -4,6 +4,10 @@ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
 import uvicorn
 import asyncio
 import nest_asyncio
+import re
+from typing import Optional, List, Dict
+from bs4 import BeautifulSoup
+from datetime import datetime
 
 # Apply nest_asyncio to allow nested event loops
 nest_asyncio.apply()
@@ -20,11 +24,72 @@ class CrawlRequest(BaseModel):
     excluded_tags: list[str] = ["nav", "footer", "aside"]
     remove_overlay_elements: bool = True
 
+class Article(BaseModel):
+    title: str
+    url: str
+    description: Optional[str] = None
+    image_url: Optional[str] = None
+    timestamp: Optional[str] = None
+    category: Optional[str] = None
+
 class CrawlResponse(BaseModel):
     url: str
-    markdown: str
     success: bool
-    error: str = None
+    error: Optional[str] = None
+    metadata: Dict = {}
+    articles: List[Article] = []
+    raw_markdown: Optional[str] = None
+
+def extract_articles(markdown: str) -> List[Article]:
+    articles = []
+
+    # Extract articles using regex
+    article_pattern = r'\[(.*?)\]\((.*?)\)(.*?)(?=\[|$)'
+    matches = re.finditer(article_pattern, markdown, re.DOTALL)
+
+    for match in matches:
+        title = match.group(1).strip()
+        url = match.group(2).replace('<', '').replace('>', '')
+        description = match.group(3).strip()
+
+        # Skip navigation links and other non-article content
+        if any(skip in title.lower() for skip in ['...', 'navigation', 'menu', 'logo', 'existing code']):
+            continue
+
+        # Extract image URL if present
+        image_url = None
+        image_match = re.search(r'!\[(.*?)\]\((.*?)\)', description)
+        if image_match:
+            image_url = image_match.group(2)
+            description = description.replace(image_match.group(0), '').strip()
+
+        # Clean up description
+        description = re.sub(r'\[(.*?)\]\((.*?)\)', '', description).strip()
+        if description and len(description) > 3:  # Only include if description is meaningful
+            article = Article(
+                title=title,
+                url=url,
+                description=description,
+                image_url=image_url
+            )
+            articles.append(article)
+
+    return articles
+
+def extract_metadata(markdown: str) -> Dict:
+    metadata = {
+        "timestamp": datetime.now().isoformat(),
+        "categories": [],
+        "total_articles": 0
+    }
+
+    # Extract categories
+    category_pattern = r'##\s+\[(.*?)\]'
+    categories = re.findall(category_pattern, markdown)
+    if categories:
+        metadata["categories"] = [cat.strip() for cat in categories]
+
+    return metadata
 
 @app.post("/crawl", response_model=CrawlResponse)
 async def crawl_url(request: CrawlRequest):
@@ -44,10 +109,18 @@ async def crawl_url(request: CrawlRequest):
             config=config
         )
 
+        # Extract articles and metadata
+        markdown = result.markdown_v2.raw_markdown
+        articles = extract_articles(markdown)
+        metadata = extract_metadata(markdown)
+        metadata["total_articles"] = len(articles)
+
         return CrawlResponse(
             url=str(request.url),
-            markdown=result.markdown_v2.raw_markdown,
-            success=result.success
+            success=result.success,
+            metadata=metadata,
+            articles=articles,
+            raw_markdown=markdown if result.success else None
        )
 
    except Exception as e:
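
As a quick local check of the new parsing helpers, something along these lines could be run next to app.py. The sample markdown is made up, the script name is hypothetical, and the import assumes app.py only starts uvicorn under an if __name__ == "__main__" guard:

# check_extract.py (hypothetical helper script, not part of this commit)
from app import extract_articles, extract_metadata

sample = (
    "## [ Technology ](https://example.com/tech)\n"
    "[Sample headline](https://example.com/article-1) A short teaser line for the story.\n"
    "[Menu](https://example.com/menu)\n"
)

articles = extract_articles(sample)
metadata = extract_metadata(sample)
metadata["total_articles"] = len(articles)

# Expected: one Article ("Sample headline"); the "Menu" link is filtered out
# and the "Technology" heading is picked up as a category.
for article in articles:
    print(article.title, article.url)
print(metadata["categories"], metadata["total_articles"])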
requirements.txt CHANGED
@@ -1,6 +1,8 @@
-fastapi
-uvicorn
-crawl4ai
-nest-asyncio
-pydantic
-python-multipart
+fastapi>=0.109.0
+uvicorn>=0.27.0
+pydantic>=2.5.3
+beautifulsoup4>=4.12.0
+crawl4ai>=0.1.0
+nest-asyncio>=1.6.0
+python-multipart>=0.0.6
+typing-extensions>=4.9.0
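
With the pinned dependencies installed (pip install -r requirements.txt) and the service started, for example with uvicorn app:app --port 8000, the reshaped /crawl response can be exercised with the standard library alone. The host, port, and target URL below are placeholders, not part of the commit:

# call_crawl.py (hypothetical client sketch)
import json
import urllib.request

payload = {
    "url": "https://example.com/news",            # CrawlRequest.url
    "excluded_tags": ["nav", "footer", "aside"],
    "remove_overlay_elements": True,
}

req = urllib.request.Request(
    "http://127.0.0.1:8000/crawl",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

# The new response shape exposes metadata, parsed articles, and the raw markdown.
print(body["success"], body["metadata"].get("total_articles"))
for article in body["articles"]:
    print(article["title"], article["url"])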