File size: 5,097 Bytes
f2a2588 da05e38 f2a2588 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import re
import requests
from bs4 import BeautifulSoup , Comment
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
class Preprocessor(ABC):
    """
    Abstract base class for preprocessors.

    Defines the interface for transforming raw inputs into structured data.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        Initialize the preprocessor with optional configuration.

        Args:
            config: A dictionary of configuration settings.
                - keep_tags (bool): If True, keeps HTML tags in the output;
                  otherwise, cleans them.
        """
        self.config = config if config is not None else {'keep_tags': False}

    def _fetch_content(self, url: str) -> str:
        """
        Fetches the raw page content from a URL.

        Args:
            url: The URL to fetch content from.

        Returns:
            The raw response body text of the page.

        Raises:
            ValueError: If the URL cannot be fetched, times out, or the
                server responds with an HTTP error status (4xx/5xx).
        """
        # Browser-like headers help avoid being blocked by some websites.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.6",
            "Cache-Control": "max-age=0",
            "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
        }
        try:
            # Make the HTTP GET request with a timeout.
            response = requests.get(url, headers=headers, timeout=15)
            # Surface HTTP error statuses (4xx/5xx) instead of silently
            # returning an error page body as if it were real content.
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            # Catch any network-related errors (DNS, connection, timeout,
            # HTTP status) and re-raise them as a more user-friendly
            # ValueError, preserving the original exception as the cause.
            raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}") from e

    @abstractmethod
    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (HTML, text, etc.) and apply preprocessing steps.

        Args:
            content: The raw data to preprocess, or a URL when is_url is True.
            is_url: If True, content is treated as a URL whose page is
                fetched before preprocessing.

        Returns:
            The cleaned text content ready for downstream tasks.
        """
        pass
class BasicPreprocessor(Preprocessor):
    """
    Concrete preprocessor with common HTML-cleaning functionality.

    Extend this class for more specialised preprocessing tasks.
    """

    # TODO: Might need to think of how to improve this later
    def _clean_html(self, html_content: str) -> str:
        """
        Clean the given HTML content.

        Steps applied:
        - Remove <script> and <style> tags together with their content.
        - Strip HTML comments.
        - Unless the 'keep_tags' config flag is set, extract the visible
          text and collapse all runs of whitespace into single spaces.

        Args:
            html_content (str): The HTML content to clean.

        Returns:
            str: The remaining HTML markup when 'keep_tags' is True,
            otherwise the cleaned, visible text.
        """
        soup = BeautifulSoup(html_content, "html.parser")

        # Drop non-content elements entirely (tag plus contents).
        for unwanted in soup(["script", "style"]):
            unwanted.decompose()

        # Strip out HTML comment nodes.
        for comment_node in soup.find_all(string=lambda node: isinstance(node, Comment)):
            comment_node.extract()

        # Callers may opt to keep the (now sanitised) markup as-is.
        if self.config.get('keep_tags', False):
            return str(soup)

        # Otherwise reduce to visible text with normalised whitespace.
        visible_text = soup.get_text(separator=" ", strip=True)
        return re.sub(r'\s+', ' ', visible_text)

    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (HTML, text, etc.) and apply preprocessing steps.

        Args:
            content: The raw data to preprocess, or a URL when is_url is True.
            is_url: If True, content is treated as a URL to fetch first.

        Returns:
            The cleaned text content, stripped of leading and trailing
            whitespace.
        """
        raw_html = self._fetch_content(content) if is_url else content
        return self._clean_html(raw_html).strip()
|