phishing-detector-api / url_tools.py
kokluch's picture
Add urlscan.io to classify as junk malicious urls.
d9f1916
raw
history blame
1.08 kB
import re
from urllib.parse import urljoin, urlparse, urlunparse

import httpx
def extract_urls(text: str):
    """Return every URL-like substring found in ``text``, in order of appearance.

    A match is a host name ending in an alphabetic suffix of at least two
    letters, optionally preceded by ``http://``, ``https://`` or ``www.`` and
    optionally followed by a ``/``-prefixed tail that runs until the next
    whitespace, angle bracket or quote character.
    """
    matcher = re.compile(
        r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    )
    # The pattern has no capturing groups, so each hit's full span is the URL.
    return [hit.group(0) for hit in matcher.finditer(text)]
def normalize_url(url: str) -> str:
    """Return ``url`` as a string that carries a scheme and a network location.

    The input is parsed with ``http`` as the fallback scheme. If that parse
    finds no network location (as happens for bare hosts such as
    ``example.com/path``), the input is re-parsed with ``http://`` prepended.
    """
    parts = urlparse(url, scheme="http")
    if parts.netloc:
        return urlunparse(parts)

    # No host was recognised: treat the whole input as "host[/path]".
    return urlunparse(urlparse("http://" + url))
def resolve_short_url(url: str) -> str:
    """Resolve a single redirect hop for ``url`` and return the target.

    The URL is normalized, then a HEAD request is sent without following
    redirects. If the response is a redirect (301, 302, 303, 307 or 308) with
    a ``Location`` header, that location is returned as an absolute URL.

    Args:
        url: The URL to probe; a missing scheme is filled in by
            ``normalize_url``.

    Returns:
        The absolute redirect target, or the normalized input URL when the
        response is not a redirect, the redirect carries no ``Location``
        header, or the request could not be made.
    """
    url = normalize_url(url)
    try:
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
            if response.status_code in {301, 302, 303, 307, 308}:
                location = response.headers.get("location")
                if location:
                    # Location may be relative (e.g. "/landing"); resolve it
                    # against the URL that was actually requested.
                    return urljoin(url, location)
            # Not a redirect, or a redirect without a usable Location header.
            return url
    except (httpx.RequestError, httpx.InvalidURL) as e:
        # InvalidURL is not a RequestError subclass; catch it too so that a
        # malformed URL takes the same best-effort fallback.
        print(f"Error: {e}")
        return url