Spaces:

NightFury2710/myCrawl4ai

Sleeping

NightFury2710 committed on Jan 29

Commit

d99bc8b

1 Parent(s): 7622d5e

update api handle 3

Files changed (1) hide show

app.py CHANGED Viewed

@@ -44,25 +44,31 @@ class CrawlResponse(BaseModel):
 def clean_url(url: str) -> str:
     """Clean and normalize URLs"""
-    # Remove angle brackets
-    url = url.replace('<', '').replace('>', '')
-    # Fix double domain issues
-    if 'https://' in url[8:]:  # Check after first https://
-        # Extract the actual path after the second https://
-        parts = url.split('https://', 2)
-        if len(parts) > 2:
-            url = 'https://' + parts[2]
-        else:
-            url = 'https://' + parts[1]
     # Remove any markdown formatting or extra parameters
-    url = url.split(' ')[0].split(')')[0]
     # Remove any trailing slashes
-    url = url.rstrip('/')
-    return url
 def is_valid_title(title: str) -> bool:
     """Check if the title is valid"""

 def clean_url(url: str) -> str:
     """Clean and normalize URLs"""
+    # Remove angle brackets and spaces
+    url = url.replace('<', '').replace('>', '').strip()
+    # Extract domain from the first https:// occurrence
+    if url.startswith('https://'):
+        domain = url[8:].split('/')[0]
+        # Remove any duplicate domains
+        cleaned_url = url.replace(f'https://{domain}/{domain}', domain)
+        cleaned_url = cleaned_url.replace(f'https://{domain}/https:/', '')
+        cleaned_url = cleaned_url.replace(f'https://{domain}/https://{domain}', domain)
+        # Ensure proper https:// prefix
+        if not cleaned_url.startswith('https://'):
+            cleaned_url = f'https://{cleaned_url}'
+    else:
+        cleaned_url = url
     # Remove any markdown formatting or extra parameters
+    cleaned_url = cleaned_url.split(' ')[0].split(')')[0]
     # Remove any trailing slashes
+    cleaned_url = cleaned_url.rstrip('/')
+    return cleaned_url
 def is_valid_title(title: str) -> bool:
     """Check if the title is valid"""