NightFury2710 committed on
Commit
d99bc8b
·
1 Parent(s): 7622d5e

update api handle 3

Browse files
Files changed (1) hide show
  1. app.py +19 -13
app.py CHANGED
@@ -44,25 +44,31 @@ class CrawlResponse(BaseModel):
44
 
45
  def clean_url(url: str) -> str:
46
  """Clean and normalize URLs"""
47
- # Remove angle brackets
48
- url = url.replace('<', '').replace('>', '')
49
 
50
- # Fix double domain issues
51
- if 'https://' in url[8:]: # Check after first https://
52
- # Extract the actual path after the second https://
53
- parts = url.split('https://', 2)
54
- if len(parts) > 2:
55
- url = 'https://' + parts[2]
56
- else:
57
- url = 'https://' + parts[1]
 
 
 
 
 
 
58
 
59
  # Remove any markdown formatting or extra parameters
60
- url = url.split(' ')[0].split(')')[0]
61
 
62
  # Remove any trailing slashes
63
- url = url.rstrip('/')
64
 
65
- return url
66
 
67
  def is_valid_title(title: str) -> bool:
68
  """Check if the title is valid"""
 
44
 
45
  def clean_url(url: str) -> str:
46
  """Clean and normalize URLs"""
47
+ # Remove angle brackets and spaces
48
+ url = url.replace('<', '').replace('>', '').strip()
49
 
50
+ # Extract domain from the first https:// occurrence
51
+ if url.startswith('https://'):
52
+ domain = url[8:].split('/')[0]
53
+
54
+ # Remove any duplicate domains
55
+ cleaned_url = url.replace(f'https://{domain}/{domain}', domain)
56
+ cleaned_url = cleaned_url.replace(f'https://{domain}/https:/', '')
57
+ cleaned_url = cleaned_url.replace(f'https://{domain}/https://{domain}', domain)
58
+
59
+ # Ensure proper https:// prefix
60
+ if not cleaned_url.startswith('https://'):
61
+ cleaned_url = f'https://{cleaned_url}'
62
+ else:
63
+ cleaned_url = url
64
 
65
  # Remove any markdown formatting or extra parameters
66
+ cleaned_url = cleaned_url.split(' ')[0].split(')')[0]
67
 
68
  # Remove any trailing slashes
69
+ cleaned_url = cleaned_url.rstrip('/')
70
 
71
+ return cleaned_url
72
 
73
  def is_valid_title(title: str) -> bool:
74
  """Check if the title is valid"""