Commit 6c4f9d7 · parent 8ecfdd5 · update api handle 3

app.py CHANGED
@@ -21,8 +21,10 @@ app = FastAPI(
 class CrawlRequest(BaseModel):
     url: HttpUrl
     cache_mode: str = "ENABLED"
-    excluded_tags: list[str] = ["nav", "footer", "aside"]
+    excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
+    max_pages: int = 1  # Limit number of pages to crawl
+    timeout: int = 30  # Timeout in seconds

 class Article(BaseModel):
     title: str
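For reference, the request body accepted by the updated CrawlRequest model could look like the sketch below. This is not part of the commit: it assumes it is run next to app.py so the model can be imported, and the values are only illustrative (cache_mode must name a CacheMode member, since the endpoint resolves it with getattr).

# Illustrative payload for the updated CrawlRequest model (values are examples only).
from app import CrawlRequest  # assumes this script sits next to app.py

payload = {
    "url": "https://example.com/news",   # hypothetical target page
    "cache_mode": "ENABLED",             # must match a CacheMode member name
    "excluded_tags": ["nav", "footer", "aside", "header", "script", "style"],
    "remove_overlay_elements": True,
    "max_pages": 1,                      # new field
    "timeout": 30,                       # new field
}

req = CrawlRequest(**payload)            # pydantic validates the URL and field types
print(req.max_pages, req.timeout)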
@@ -31,6 +33,7 @@ class Article(BaseModel):
     image_url: Optional[str] = None
     timestamp: Optional[str] = None
     category: Optional[str] = None
+    source_url: Optional[str] = None  # Added to track original source

 class CrawlResponse(BaseModel):
     url: str
@@ -39,58 +42,114 @@ class CrawlResponse(BaseModel):
     metadata: Dict = {}
     articles: List[Article] = []
     raw_markdown: Optional[str] = None
+    stats: Dict = {}
+
+def clean_url(url: str) -> str:
+    """Clean and normalize URLs"""
+    # Remove angle brackets and extra domains
+    url = url.replace('<', '').replace('>', '')
+
+    # Fix double domain issues
+    if 'https://' in url[8:]:  # Check after first https://
+        url = url.replace('https://', '', 1)  # Remove first occurrence
+
+    # Remove any markdown or text formatting
+    url = url.split(' ')[0].split(')')[0]
+
+    return url
+
+def is_valid_title(title: str) -> bool:
+    """Check if the title is valid"""
+    invalid_patterns = [
+        '**_access_time_',
+        'existing code',
+        '...',
+        'navigation',
+        'menu',
+        'logo'
+    ]
+
+    # Check for invalid patterns
+    if any(pattern in title.lower() for pattern in invalid_patterns):
+        return False
+
+    # Check if it's likely a filename or URL
+    if title.count('-') > 3 or title.count('_') > 2:
+        return False
+
+    # Check if title is too short
+    if len(title.strip()) < 5:
+        return False
+
+    return True
+
+def clean_description(description: str) -> Optional[str]:
+    """Clean and normalize description text"""
+    if not description:
+        return None
+
+    # Remove markdown links
+    description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
+
+    # Remove URLs
+    description = re.sub(r'https?://\S+', '', description)
+
+    # Remove special characters and extra whitespace
+    description = description.replace('(', '').replace(')', '').replace('<', '').replace('>', '')
+    description = ' '.join(description.split())
+
+    return description if len(description) > 10 else None

 def extract_articles(markdown: str) -> List[Article]:
     articles = []

-    # Updated regex pattern
+    # Updated regex pattern
     article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
     matches = re.finditer(article_pattern, markdown, re.DOTALL)

+    seen_urls = set()  # Track unique URLs
+
     for match in matches:
-        # Extract components
-        image_title = match.group(1)  # Image alt text if exists
         title = match.group(2)  # Article title
         url = match.group(3)  # Article URL
         description = match.group(6)  # Description text

-        #
-
+        # Skip if title is invalid
+        if not is_valid_title(title):
+            continue
+
+        # Clean and validate URL
+        url = clean_url(url)

-        # Skip
-        if
+        # Skip if URL already processed or is an image
+        if url in seen_urls or url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
             continue

-
+        seen_urls.add(url)
+
+        # Clean description
+        clean_desc = clean_description(description)
+
+        # Extract image URL if present
         image_url = None
         image_match = re.search(r'!\[([^\]]*)\]\(([^)]+)\)', description) if description else None
         if image_match:
-            image_url = image_match.group(2)
-            description = description.replace(image_match.group(0), '').strip()
-
-        # Clean up description
-        if description:
-            # Remove markdown links from description
-            description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
-            # Remove quotes
-            description = description.replace('"', '').strip()
-            # Remove multiple spaces
-            description = ' '.join(description.split())

-
-
-
-
-
-
-
-
-
-
+        article = Article(
+            title=title.strip(),
+            url=url,
+            description=clean_desc,
+            image_url=image_url,
+            timestamp=None,
+            category=None,
+            source_url=None
+        )
+        articles.append(article)

     return articles

-def extract_metadata(markdown: str) -> Dict:
+def extract_metadata(markdown: str, html: str) -> Dict:
     metadata = {
         "timestamp": datetime.now().isoformat(),
         "categories": [],
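A quick, informal check of the three new helpers, not part of the commit. It assumes app.py is importable from the current directory; the expected results in the comments follow directly from the function bodies above.

# Ad-hoc sanity checks for clean_url, is_valid_title and clean_description.
from app import clean_url, is_valid_title, clean_description

print(clean_url("<https://https://example.com/news/item-1>"))
# -> https://example.com/news/item-1  (angle brackets stripped, duplicated scheme removed)

print(is_valid_title("Menu"))                              # -> False (matches an invalid pattern)
print(is_valid_title("Markets rally on rate cut hopes"))   # -> True

print(clean_description("Read more at [the site](https://example.com) today"))
# -> "Read more at the site today"  (markdown link unwrapped, whitespace normalized)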
@@ -108,14 +167,21 @@ def extract_metadata(markdown: str) -> Dict:
 @app.post("/crawl", response_model=CrawlResponse)
 async def crawl_url(request: CrawlRequest):
     try:
-        # Convert cache_mode string to enum
         cache_mode = getattr(CacheMode, request.cache_mode)

+        # Create crawler with improved configuration
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
                 cache_mode=cache_mode,
                 excluded_tags=request.excluded_tags,
-                remove_overlay_elements=request.remove_overlay_elements
+                remove_overlay_elements=request.remove_overlay_elements,
+                max_pages=request.max_pages,
+                timeout=request.timeout,
+                # Added from quickstart examples
+                remove_ads=True,
+                extract_text=True,
+                extract_links=True,
+                extract_images=True
             )

             result = await crawler.arun(
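The extra CrawlerRunConfig arguments introduced here (max_pages, timeout, remove_ads, extract_text, extract_links, extract_images) are taken from the quickstart examples per the comment, but not every crawl4ai release necessarily accepts all of them. Below is a defensive sketch, not part of this commit, that falls back to the minimal config if the installed constructor rejects an unknown keyword with a TypeError.

from crawl4ai import CacheMode, CrawlerRunConfig

def build_config(request, cache_mode: CacheMode) -> CrawlerRunConfig:
    """Build the run config, dropping options the installed crawl4ai may not accept."""
    base = dict(
        cache_mode=cache_mode,
        excluded_tags=request.excluded_tags,
        remove_overlay_elements=request.remove_overlay_elements,
    )
    extras = dict(
        max_pages=request.max_pages,
        timeout=request.timeout,
        remove_ads=True,
        extract_text=True,
        extract_links=True,
        extract_images=True,
    )
    try:
        return CrawlerRunConfig(**base, **extras)
    except TypeError:
        # Installed version does not expose these keyword arguments; keep the core options.
        return CrawlerRunConfig(**base)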
@@ -123,18 +189,29 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )

-            #
+            # Use both markdown and HTML results for better extraction
             markdown = result.markdown_v2.raw_markdown
+            html = result.html
+
+            # Extract content using both markdown and HTML
             articles = extract_articles(markdown)
-            metadata = extract_metadata(markdown)
-
+            metadata = extract_metadata(markdown, html)
+
+            # Add source URL to articles
+            for article in articles:
+                article.source_url = str(request.url)

             return CrawlResponse(
                 url=str(request.url),
                 success=result.success,
                 metadata=metadata,
                 articles=articles,
-                raw_markdown=markdown if result.success else None
+                raw_markdown=markdown if result.success else None,
+                stats={
+                    "total_links": len(result.links) if result.links else 0,
+                    "total_images": len(result.images) if result.images else 0,
+                    "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
+                }
             )

     except Exception as e:
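A minimal round-trip sketch against a locally running instance, showing the new stats block and the per-article source_url. The host, port and target URL are assumptions (e.g. serving with uvicorn app:app --port 8000), not part of the commit.

# Post a crawl request and read the fields added in this commit.
import requests

resp = requests.post(
    "http://localhost:8000/crawl",
    json={"url": "https://example.com/news", "cache_mode": "ENABLED"},
    timeout=60,
)
data = resp.json()
print(data["stats"])        # total_links, total_images, processing_time
for article in data["articles"]:
    print(article["title"], article["source_url"])  # source_url now carries the crawled URL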