Spaces:
Sleeping
Sleeping
File size: 1,078 Bytes
d9f1916 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import re
from urllib.parse import urlparse, urlunparse
import httpx
# Compiled once at import time. The pieces are, in order:
#   1. optional "http://", "https://" or "www." prefix, then a dotted host
#      ending in an alphabetic TLD of at least two letters;
#   2. optional ":port" (1-5 digits) so "host:8080/path" is kept whole
#      instead of being cut off right after the host;
#   3. optional path starting with "/" and running until whitespace, an
#      angle bracket, or a quote character.
_URL_PATTERN = re.compile(
    r"(?:(?:https?://|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
    r"(?::\d{1,5})?"
    r"""(?:/[^\s<>"']*)?"""
)


def extract_urls(text: str):
    """Extract URL-like substrings from raw text.

    Args:
        text: Arbitrary text that may contain URLs, with or without a scheme.

    Returns:
        A list of the matched substrings in order of appearance (empty when
        nothing matches). The pattern contains no capturing groups, so each
        item is the full match.

    Note:
        The match is deliberately permissive: any dotted token that ends in
        two or more letters (for example ``file.txt``) is reported, and
        punctuation that directly follows a path is kept as part of it.
    """
    return _URL_PATTERN.findall(text)
def normalize_url(url: str) -> str:
    """Return *url* as a full URL string, defaulting the scheme to ``http``.

    The input is parsed with ``http`` as the fallback scheme. When that parse
    finds a network location, the parsed pieces are reassembled and returned.
    Otherwise (e.g. a bare ``example.com/path``, which parses as a path only)
    the input is re-parsed with ``http://`` prepended so the host lands in the
    network-location slot.
    """
    components = urlparse(url, scheme="http")
    if components.netloc:
        return urlunparse(components)
    # No host was recognised: force one by prefixing an explicit scheme.
    return urlunparse(urlparse("http://" + url))
def resolve_short_url(url: str) -> str:
    """Resolve one redirect hop of a (possibly shortened) URL.

    Sends a single HEAD request without following redirects. If the server
    answers with a 301/302/303/307/308 status and a ``Location`` header, the
    redirect target is returned as an absolute URL.

    Args:
        url: The URL to probe; it is passed through ``normalize_url`` first.

    Returns:
        The redirect target when the response is a redirect carrying a
        ``Location`` header; otherwise the normalized input URL. The
        normalized input URL is also returned when the request fails
        (best-effort: the error is printed, not raised).
    """
    # Function-scope import so this block does not depend on the file-level
    # import line also listing urljoin.
    from urllib.parse import urljoin

    url = normalize_url(url)
    try:
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
    except (httpx.RequestError, httpx.InvalidURL) as e:
        # httpx.InvalidURL is not a RequestError subclass; catch it too so a
        # malformed URL gets the same best-effort fallback as a network error.
        print(f"Error: {e}")
        return url

    if response.status_code in {301, 302, 303, 307, 308}:
        location = response.headers.get("location")
        if location:
            # Location may be a relative reference; resolve it against the
            # URL that was requested so callers always get an absolute URL.
            return urljoin(url, location)
    # Not a redirect, or a redirect without a Location header.
    return url