Commit 7622d5e (parent: ade6557)
update api handle 3

app.py CHANGED
```diff
@@ -44,16 +44,24 @@ class CrawlResponse(BaseModel):
 
 def clean_url(url: str) -> str:
     """Clean and normalize URLs"""
-    # Remove angle brackets
+    # Remove angle brackets
     url = url.replace('<', '').replace('>', '')
 
     # Fix double domain issues
     if 'https://' in url[8:]:  # Check after first https://
-
+        # Extract the actual path after the second https://
+        parts = url.split('https://', 2)
+        if len(parts) > 2:
+            url = 'https://' + parts[2]
+        else:
+            url = 'https://' + parts[1]
 
-    # Remove any markdown or
+    # Remove any markdown formatting or extra parameters
     url = url.split(' ')[0].split(')')[0]
 
+    # Remove any trailing slashes
+    url = url.rstrip('/')
+
     return url
 
 def is_valid_title(title: str) -> bool:
```
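For a quick sanity check, the rewritten `clean_url` from this hunk runs standalone; the sample URL below is an illustrative assumption, not a fixture from the repo.

```python
# Standalone sketch of the new clean_url from this commit; the sample
# input is an assumption for illustration, not a repository test case.

def clean_url(url: str) -> str:
    """Clean and normalize URLs"""
    # Remove angle brackets
    url = url.replace('<', '').replace('>', '')

    # Fix double domain issues
    if 'https://' in url[8:]:  # Check after first https://
        # Extract the actual path after the second https://
        parts = url.split('https://', 2)
        if len(parts) > 2:
            url = 'https://' + parts[2]
        else:
            url = 'https://' + parts[1]

    # Remove any markdown formatting or extra parameters
    url = url.split(' ')[0].split(')')[0]

    # Remove any trailing slashes
    url = url.rstrip('/')

    return url

# A doubled domain collapses to the last occurrence:
print(clean_url('<https://example.com/https://example.com/story/>'))
# https://example.com/story
```

Because `split('https://', 2)` caps the result at three parts, a URL that starts with `https://` and repeats it once splits into `['', first-domain-and-path, repeated-part]`, so the `len(parts) > 2` branch keeps the final occurrence; the `else` branch covers inputs where the scheme only appears past position 8.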
```diff
@@ -86,6 +94,10 @@ def clean_description(description: str) -> Optional[str]:
     if not description:
         return None
 
+    # Remove access_time markers
+    if '_access_time_' in description:
+        return None
+
     # Remove markdown links
     description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
 
```
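The new early return plus the existing link-stripping step can be sketched in isolation; the sample strings are assumptions, and the real function applies further cleanup after the `re.sub` shown in the hunk.

```python
import re
from typing import Optional

# Sketch of the clean_description steps visible in this hunk; the
# sample strings are assumed for illustration.
def clean_description(description: str) -> Optional[str]:
    if not description:
        return None

    # Remove access_time markers
    if '_access_time_' in description:
        return None

    # Remove markdown links, keeping only the link text
    description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
    return description  # the full function continues with more cleanup

print(clean_description('Read [the report](https://example.com/r) today'))
# Read the report today
print(clean_description('_access_time_ 2 hours ago'))
# None
```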
```diff
@@ -100,13 +112,12 @@ def clean_description(description: str) -> Optional[str]:
 
 def extract_articles(markdown: str) -> List[Article]:
     articles = []
+    seen_urls = set()  # Track unique URLs
 
     # Updated regex pattern
     article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
     matches = re.finditer(article_pattern, markdown, re.DOTALL)
 
-    seen_urls = set()  # Track unique URLs
-
     for match in matches:
         title = match.group(2)  # Article title
         url = match.group(3)  # Article URL
```
```diff
@@ -141,7 +152,7 @@ def extract_articles(markdown: str) -> List[Article]:
             image_url=image_url,
             timestamp=None,
             category=None,
-            source_url=None
+            source_url=None  # Will be set later
         )
         articles.append(article)
 
```
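To see how the relocated `seen_urls` set is meant to pair with the updated pattern, here is a sketch; the markdown input and the membership check are assumptions, since these hunks only show where the set is created.

```python
import re

article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'

# Assumed sample input; the duplicated link shows why dedup matters.
markdown = (
    '[Big Story](https://example.com/big) Breaking news summary\n'
    '[Big Story](https://example.com/big) duplicate listing\n'
    '[Other Story](https://example.com/other) Second summary\n'
)

seen_urls = set()  # Track unique URLs, as in the new code
for match in re.finditer(article_pattern, markdown, re.DOTALL):
    title = match.group(2)  # Article title
    url = match.group(3)    # Article URL
    if url in seen_urls:    # assumed check; the hunk only creates the set
        continue
    seen_urls.add(url)
    print(title, '->', url)
# Big Story -> https://example.com/big
# Other Story -> https://example.com/other
```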