Spaces:
Sleeping
Sleeping
Commit
·
d99bc8b
1
Parent(s):
7622d5e
update api handle 3
Browse files
app.py
CHANGED
@@ -44,25 +44,31 @@ class CrawlResponse(BaseModel):
|
|
44 |
|
45 |
def clean_url(url: str) -> str:
|
46 |
"""Clean and normalize URLs"""
|
47 |
-
# Remove angle brackets
|
48 |
-
url = url.replace('<', '').replace('>', '')
|
49 |
|
50 |
-
#
|
51 |
-
if 'https://'
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
# Remove any markdown formatting or extra parameters
|
60 |
-
|
61 |
|
62 |
# Remove any trailing slashes
|
63 |
-
|
64 |
|
65 |
-
return
|
66 |
|
67 |
def is_valid_title(title: str) -> bool:
|
68 |
"""Check if the title is valid"""
|
|
|
44 |
|
45 |
def clean_url(url: str) -> str:
    """Clean and normalize a URL.

    Steps, in order:

    1. Remove angle brackets and surrounding whitespace.
    2. For ``https://`` URLs, collapse an accidentally duplicated prefix:
       ``https://d/https://rest`` and ``https://d/https:/rest`` become
       ``https://rest``; ``https://d/d/rest`` becomes ``https://d/rest``.
       URLs that do not start with ``https://`` skip this step.
    3. Drop everything from the first space or ``)`` onwards (trailing
       markdown or extra text).
    4. Strip trailing slashes.

    Args:
        url: Raw URL text, possibly wrapped in ``<...>`` or followed by
            markdown residue.

    Returns:
        The cleaned URL string.
    """
    # Remove angle brackets and spaces
    url = url.replace('<', '').replace('>', '').strip()

    if url.startswith('https://'):
        # The domain is everything between the scheme and the first slash.
        domain = url[len('https://'):].split('/')[0]
        base = f'https://{domain}'

        # An embedded second URL: strip the outer prefix together with the
        # inner scheme.  The double-slash form must be handled before the
        # single-slash form, otherwise the shorter pattern matches first and
        # leaves a stray leading '/' (yielding 'https:///...').
        cleaned_url = url.replace(f'{base}/https://', '')
        cleaned_url = cleaned_url.replace(f'{base}/https:/', '')

        # A bare duplicated domain: 'https://d/d/...' -> 'd/...'
        cleaned_url = cleaned_url.replace(f'{base}/{domain}', domain)

        # The replacements above drop the scheme; restore it.
        if not cleaned_url.startswith('https://'):
            cleaned_url = f'https://{cleaned_url}'
    else:
        cleaned_url = url

    # Remove any markdown formatting or extra parameters
    cleaned_url = cleaned_url.split(' ')[0].split(')')[0]

    # Remove any trailing slashes
    return cleaned_url.rstrip('/')
|
72 |
|
73 |
def is_valid_title(title: str) -> bool:
|
74 |
"""Check if the title is valid"""
|