NightFury2710 committed on
Commit
7622d5e
·
1 Parent(s): ade6557

update api handle 3

Browse files
Files changed (1) hide show
  1. app.py +17 -6
app.py CHANGED
@@ -44,16 +44,24 @@ class CrawlResponse(BaseModel):
44
 
45
  def clean_url(url: str) -> str:
46
  """Clean and normalize URLs"""
47
- # Remove angle brackets and extra domains
48
  url = url.replace('<', '').replace('>', '')
49
 
50
  # Fix double domain issues
51
  if 'https://' in url[8:]: # Check after first https://
52
- url = url.replace('https://', '', 1) # Remove first occurrence
 
 
 
 
 
53
 
54
- # Remove any markdown or text formatting
55
  url = url.split(' ')[0].split(')')[0]
56
 
 
 
 
57
  return url
58
 
59
  def is_valid_title(title: str) -> bool:
@@ -86,6 +94,10 @@ def clean_description(description: str) -> Optional[str]:
86
  if not description:
87
  return None
88
 
 
 
 
 
89
  # Remove markdown links
90
  description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
91
 
@@ -100,13 +112,12 @@ def clean_description(description: str) -> Optional[str]:
100
 
101
  def extract_articles(markdown: str) -> List[Article]:
102
  articles = []
 
103
 
104
  # Updated regex pattern
105
  article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
106
  matches = re.finditer(article_pattern, markdown, re.DOTALL)
107
 
108
- seen_urls = set() # Track unique URLs
109
-
110
  for match in matches:
111
  title = match.group(2) # Article title
112
  url = match.group(3) # Article URL
@@ -141,7 +152,7 @@ def extract_articles(markdown: str) -> List[Article]:
141
  image_url=image_url,
142
  timestamp=None,
143
  category=None,
144
- source_url=None
145
  )
146
  articles.append(article)
147
 
 
44
 
45
def clean_url(url: str) -> str:
    """Clean and normalize URLs"""
    # Strip markdown-style angle brackets around the link target.
    cleaned = url.replace('<', '').replace('>', '')

    # Collapse accidental double-domain artifacts: if a second
    # "https://" occurs past the leading scheme (first 8 chars),
    # keep only the portion after the last scheme prefix.
    if 'https://' in cleaned[8:]:
        pieces = cleaned.split('https://', 2)
        tail = pieces[2] if len(pieces) > 2 else pieces[1]
        cleaned = 'https://' + tail

    # Drop anything trailing the first space or closing parenthesis
    # (leftover markdown formatting or surrounding text).
    for separator in (' ', ')'):
        cleaned = cleaned.partition(separator)[0]

    # Normalize away trailing slashes before returning.
    return cleaned.rstrip('/')
66
 
67
  def is_valid_title(title: str) -> bool:
 
94
  if not description:
95
  return None
96
 
97
+ # Remove access_time markers
98
+ if '_access_time_' in description:
99
+ return None
100
+
101
  # Remove markdown links
102
  description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
103
 
 
112
 
113
  def extract_articles(markdown: str) -> List[Article]:
114
  articles = []
115
+ seen_urls = set() # Track unique URLs
116
 
117
  # Updated regex pattern
118
  article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
119
  matches = re.finditer(article_pattern, markdown, re.DOTALL)
120
 
 
 
121
  for match in matches:
122
  title = match.group(2) # Article title
123
  url = match.group(3) # Article URL
 
152
  image_url=image_url,
153
  timestamp=None,
154
  category=None,
155
+ source_url=None # Will be set later
156
  )
157
  articles.append(article)
158