Spaces:

kokluch
/

phishing-detector-api

Sleeping

phishing-detector-api / url_tools.py

Add urlscan.io to classify as junk malicious urls.

d9f1916 4 months ago

1.08 kB

	import re
	from urllib.parse import urlparse, urlunparse
	import httpx

	def extract_urls(text: str) -> list:
	    """Extract URL-like substrings from raw text.

	    A match consists of:
	      * an optional "http://", "https://" or "www." prefix,
	      * a host made of letters, digits, dots and hyphens that ends in a dot
	        followed by at least two letters,
	      * an optional path that starts with "/" and runs until whitespace or
	        one of the characters <, >, double quote or single quote.

	    Ports, and query strings that are not preceded by a "/", are not part
	    of the match.

	    Args:
	        text: Arbitrary text to scan.

	    Returns:
	        A list of the matched strings, in order of appearance. The list is
	        empty when nothing matches.
	    """
	    # The prefix alternation must use a bare "|". Escaping it ("\|") turns it
	    # into a literal pipe character, so the scheme would never match and
	    # "https://host/x" would be returned as just "host/x".
	    url_pattern = r"""(?:(?:https?://|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:/[^\s<>"']*)?"""
	    return re.findall(url_pattern, text)


	def normalize_url(url: str) -> str:
	    """Return *url* as a string that carries a scheme and a network location.

	    The input is parsed with "http" as the fallback scheme. If that parse
	    finds no network location (for example a bare "example.com/path", which
	    is read as a path), the input is parsed again with "http://" prepended.
	    The parsed components are then reassembled into a string.
	    """
	    components = urlparse(url, scheme="http")
	    if components.netloc:
	        return urlunparse(components)
	    # No host was recognised: force one by prepending an explicit scheme.
	    return urlunparse(urlparse("http://" + url))

	def resolve_short_url(url: str) -> str:
	    """Resolve one redirect hop of a (possibly shortened) URL.

	    The URL is first passed through ``normalize_url``. A single HEAD request
	    is then sent without following redirects, using a 5-second timeout and a
	    browser-like User-Agent header.

	    Args:
	        url: The URL to probe.

	    Returns:
	        The redirect target when the server answers 301, 302, 303, 307 or 308
	        with a non-empty Location header. A relative Location is resolved
	        against the requested URL. In every other case, including request
	        failures, the normalized input URL is returned. The result is always
	        a string.
	    """
	    # Local import: the file-level import only brings in urlparse/urlunparse.
	    from urllib.parse import urljoin

	    url = normalize_url(url)
	    try:
	        with httpx.Client(follow_redirects=False, timeout=5) as client:
	            response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
	    except (httpx.RequestError, httpx.InvalidURL) as e:
	        # Best effort: httpx.InvalidURL is not a RequestError, so it is listed
	        # explicitly to keep a malformed URL from escaping this handler.
	        print(f"Error: {e}")
	        return url

	    if response.status_code in {301, 302, 303, 307, 308}:
	        location = response.headers.get("location")
	        if location:
	            # Location may be relative (e.g. "/landing"); make it absolute.
	            return urljoin(url, location)
	    # Not a redirect, or a redirect without a usable Location header.
	    return url