John-Jiang's picture
init commit
5301c48
raw
history blame
2.9 kB
from urllib.parse import urlparse
import requests
from typing import Dict, Any, Tuple
from starfish.data_ingest.parsers.base_parser import BaseParser
class HTMLDocumentParser(BaseParser):
    """Parse HTML documents (local files or http(s) URLs) into plain text.

    Page metadata extracted during parsing is exposed on ``self.metadata``.
    BeautifulSoup is imported lazily so the dependency is only required
    when HTML parsing is actually used.
    """

    def __init__(self):
        super().__init__()
        self._bs4 = None  # lazily-imported BeautifulSoup class (see _load_bs4)
        self.metadata = {}  # metadata from the most recent parse() call

    def _load_bs4(self):
        """Import BeautifulSoup on first use; fail with install guidance if absent."""
        if self._bs4 is None:
            try:
                from bs4 import BeautifulSoup

                self._bs4 = BeautifulSoup
            except ImportError:
                raise ImportError("BeautifulSoup is required for HTML parsing. Install it with: pip install beautifulsoup4")

    def parse(self, file_path: str) -> str:
        """Parse an HTML file or URL into plain text.

        Extracted metadata (title, meta description, og:type, charset, and
        url/domain/path when fetching remotely) is stored on
        ``self.metadata`` as a side effect; it is reset on every call.

        Args:
            file_path: Path to a local HTML file, or an http(s) URL.

        Returns:
            The extracted plain text, one non-blank chunk per line.

        Raises:
            ImportError: If beautifulsoup4 is not installed.
            requests.HTTPError: If a URL fetch returns an error status.
        """
        self._load_bs4()
        self.metadata = {}

        is_url = file_path.startswith(("http://", "https://"))
        if is_url:
            # Bound the request so an unresponsive host cannot hang the parser.
            response = requests.get(file_path, timeout=30)
            response.raise_for_status()
            html_content = response.text
        else:
            with open(file_path, "r", encoding="utf-8") as f:
                html_content = f.read()

        soup = self._bs4(html_content, "html.parser")

        # Extract metadata, calling find() once per tag instead of twice.
        if soup.title:
            self.metadata["title"] = soup.title.string
        description_tag = soup.find("meta", attrs={"name": "description"})
        if description_tag:
            self.metadata["description"] = description_tag["content"]
        og_type_tag = soup.find("meta", attrs={"property": "og:type"})
        if og_type_tag:
            self.metadata["type"] = og_type_tag["content"]
        charset_tag = soup.find("meta", attrs={"charset": True})
        if charset_tag:
            self.metadata["charset"] = charset_tag["charset"]

        # Record source-location metadata when parsing a remote document.
        if is_url:
            parsed_url = urlparse(file_path)
            self.metadata["url"] = file_path
            self.metadata["domain"] = parsed_url.netloc
            self.metadata["path"] = parsed_url.path

        # Drop non-content elements before extracting text.
        for element in soup(["script", "style"]):
            element.extract()

        text = soup.get_text()
        # Strip each line, break multi-headline lines apart on runs of two
        # spaces, and drop blanks.  NOTE: the original split on a single
        # space, which placed every individual word on its own line —
        # contradicting the stated intent of splitting multi-headlines.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return "\n".join(chunk for chunk in chunks if chunk)