NightFury2710 committed on
Commit 6c4f9d7 · 1 Parent(s): 8ecfdd5

update api handle 3

Files changed (1)
  1. app.py +114 -37
app.py CHANGED
@@ -21,8 +21,10 @@ app = FastAPI(
 class CrawlRequest(BaseModel):
     url: HttpUrl
     cache_mode: str = "ENABLED"
-    excluded_tags: list[str] = ["nav", "footer", "aside"]
+    excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
+    max_pages: int = 1  # Limit number of pages to crawl
+    timeout: int = 30  # Timeout in seconds
 
 class Article(BaseModel):
     title: str
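
Reviewer note: with max_pages and timeout added to CrawlRequest, callers can bound each crawl per request. A minimal sketch of a client call, assuming the API is served locally on port 8000 (host, port, and target URL are placeholders, not part of this commit):

import requests

payload = {
    "url": "https://example.com/news",  # placeholder target
    "cache_mode": "ENABLED",
    "excluded_tags": ["nav", "footer", "aside", "header", "script", "style"],
    "remove_overlay_elements": True,
    "max_pages": 1,   # new field in this commit
    "timeout": 30     # new field in this commit, seconds
}
resp = requests.post("http://localhost:8000/crawl", json=payload)
print(resp.status_code)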
@@ -31,6 +33,7 @@ class Article(BaseModel):
     image_url: Optional[str] = None
     timestamp: Optional[str] = None
     category: Optional[str] = None
+    source_url: Optional[str] = None  # Added to track original source
 
 class CrawlResponse(BaseModel):
     url: str
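
Reviewer note: the new source_url field lets each Article carry the page it was extracted from; the endpoint fills it in further down. A hypothetical instance with made-up values, just to show the shape:

article = Article(
    title="Sample headline",
    url="https://example.com/story-1",
    description="Short teaser text for the story.",
    image_url=None,
    timestamp=None,
    category=None,
    source_url="https://example.com/news",  # set from request.url by /crawl
)
print(article.dict())  # .model_dump() on pydantic v2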
@@ -39,58 +42,114 @@ class CrawlResponse(BaseModel):
     metadata: Dict = {}
     articles: List[Article] = []
     raw_markdown: Optional[str] = None
+    stats: Dict = {}
+
+def clean_url(url: str) -> str:
+    """Clean and normalize URLs"""
+    # Remove angle brackets and extra domains
+    url = url.replace('<', '').replace('>', '')
+
+    # Fix double domain issues
+    if 'https://' in url[8:]:  # Check after first https://
+        url = url.replace('https://', '', 1)  # Remove first occurrence
+
+    # Remove any markdown or text formatting
+    url = url.split(' ')[0].split(')')[0]
+
+    return url
+
+def is_valid_title(title: str) -> bool:
+    """Check if the title is valid"""
+    invalid_patterns = [
+        '**_access_time_',
+        'existing code',
+        '...',
+        'navigation',
+        'menu',
+        'logo'
+    ]
+
+    # Check for invalid patterns
+    if any(pattern in title.lower() for pattern in invalid_patterns):
+        return False
+
+    # Check if it's likely a filename or URL
+    if title.count('-') > 3 or title.count('_') > 2:
+        return False
+
+    # Check if title is too short
+    if len(title.strip()) < 5:
+        return False
+
+    return True
+
+def clean_description(description: str) -> Optional[str]:
+    """Clean and normalize description text"""
+    if not description:
+        return None
+
+    # Remove markdown links
+    description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
+
+    # Remove URLs
+    description = re.sub(r'https?://\S+', '', description)
+
+    # Remove special characters and extra whitespace
+    description = description.replace('(', '').replace(')', '').replace('<', '').replace('>', '')
+    description = ' '.join(description.split())
+
+    return description if len(description) > 10 else None
 
 def extract_articles(markdown: str) -> List[Article]:
     articles = []
 
-    # Updated regex pattern to better handle markdown links with images
+    # Updated regex pattern
     article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
     matches = re.finditer(article_pattern, markdown, re.DOTALL)
 
+    seen_urls = set()  # Track unique URLs
+
     for match in matches:
-        # Extract components
-        image_title = match.group(1)  # Image alt text if exists
         title = match.group(2)  # Article title
         url = match.group(3)  # Article URL
         description = match.group(6)  # Description text
 
-        # Clean up the data
-        url = url.replace('<', '').replace('>', '').split(' ')[0]  # Take first URL if multiple
+        # Skip if title is invalid
+        if not is_valid_title(title):
+            continue
+
+        # Clean and validate URL
+        url = clean_url(url)
 
-        # Skip navigation links and other non-article content
-        if any(skip in title.lower() for skip in ['...', 'navigation', 'menu', 'logo', 'existing code']):
+        # Skip if URL already processed or is an image
+        if url in seen_urls or url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
             continue
 
-        # Extract image URL if present in description
+        seen_urls.add(url)
+
+        # Clean description
+        clean_desc = clean_description(description)
+
+        # Extract image URL if present
         image_url = None
         image_match = re.search(r'!\[([^\]]*)\]\(([^)]+)\)', description) if description else None
         if image_match:
-            image_url = image_match.group(2)
-            description = description.replace(image_match.group(0), '').strip()
-
-        # Clean up description
-        if description:
-            # Remove markdown links from description
-            description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
-            # Remove quotes
-            description = description.replace('"', '').strip()
-            # Remove multiple spaces
-            description = ' '.join(description.split())
+            image_url = clean_url(image_match.group(2))
 
-        if title and url and not title.startswith('!'):  # Ensure we have valid title and URL
-            article = Article(
-                title=title.strip(),
-                url=url,
-                description=description if description and len(description) > 3 else None,
-                image_url=image_url,
-                timestamp=None,  # Can be added if timestamp is found in the content
-                category=None  # Can be extracted from URL or content structure
-            )
-            articles.append(article)
+        article = Article(
+            title=title.strip(),
+            url=url,
+            description=clean_desc,
+            image_url=image_url,
+            timestamp=None,
+            category=None,
+            source_url=None
+        )
+        articles.append(article)
 
     return articles
 
-def extract_metadata(markdown: str) -> Dict:
+def extract_metadata(markdown: str, html: str) -> Dict:
     metadata = {
         "timestamp": datetime.now().isoformat(),
         "categories": [],
@@ -108,14 +167,21 @@ def extract_metadata(markdown: str) -> Dict:
 @app.post("/crawl", response_model=CrawlResponse)
 async def crawl_url(request: CrawlRequest):
     try:
-        # Convert cache_mode string to enum
         cache_mode = getattr(CacheMode, request.cache_mode)
 
+        # Create crawler with improved configuration
        async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
                 cache_mode=cache_mode,
                 excluded_tags=request.excluded_tags,
-                remove_overlay_elements=request.remove_overlay_elements
+                remove_overlay_elements=request.remove_overlay_elements,
+                max_pages=request.max_pages,
+                timeout=request.timeout,
+                # Added from quickstart examples
+                remove_ads=True,
+                extract_text=True,
+                extract_links=True,
+                extract_images=True
             )
 
             result = await crawler.arun(
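
Reviewer note: the extra CrawlerRunConfig keywords are labelled as coming from the quickstart examples, so it is worth confirming the installed crawl4ai version accepts them. Separately, request.cache_mode is still resolved with getattr, so any string that is not a CacheMode member raises AttributeError and falls through to the endpoint's except branch. A small sketch of that lookup ("NOT_A_MODE" is an invented value for illustration):

from crawl4ai import CacheMode

print(getattr(CacheMode, "ENABLED"))   # the request default resolves to an enum member
try:
    getattr(CacheMode, "NOT_A_MODE")   # invented invalid value
except AttributeError as exc:
    print(f"invalid cache_mode: {exc}")  # this is what ends up in the endpoint's except block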
@@ -123,18 +189,29 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )
 
-            # Extract articles and metadata
+            # Use both markdown and HTML results for better extraction
             markdown = result.markdown_v2.raw_markdown
+            html = result.html
+
+            # Extract content using both markdown and HTML
             articles = extract_articles(markdown)
-            metadata = extract_metadata(markdown)
-            metadata["total_articles"] = len(articles)
+            metadata = extract_metadata(markdown, html)
+
+            # Add source URL to articles
+            for article in articles:
+                article.source_url = str(request.url)
 
             return CrawlResponse(
                 url=str(request.url),
                 success=result.success,
                 metadata=metadata,
                 articles=articles,
-                raw_markdown=markdown if result.success else None
+                raw_markdown=markdown if result.success else None,
+                stats={
+                    "total_links": len(result.links) if result.links else 0,
+                    "total_images": len(result.images) if result.images else 0,
+                    "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
+                }
             )
 
     except Exception as e:
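
Reviewer note: responses now include a stats block and a per-article source_url alongside the existing fields. An invented illustration of reading them from a client (localhost:8000 and the target URL are placeholders, as in the earlier sketch):

import requests

body = requests.post(
    "http://localhost:8000/crawl",
    json={"url": "https://example.com/news"},
).json()

print(body["stats"])   # e.g. {"total_links": 0, "total_images": 0, "processing_time": None}
for article in body["articles"]:
    print(article["title"], article["source_url"])  # source_url now echoes the requested URL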
 