Spaces:

John-Jiang
/

starfish_data_ai

Running

App Files Files Community

starfish_data_ai / src /starfish /data_ingest /parsers /html_parser.py

John-Jiang

init commit

5301c48 2 months ago

raw

history blame

2.9 kB

	from urllib.parse import urlparse
	import requests
	from typing import Dict, Any, Tuple
	from starfish.data_ingest.parsers.base_parser import BaseParser


	class HTMLDocumentParser(BaseParser):
	    """Parse HTML from a local file or an http(s) URL into plain text.

	    Metadata found while parsing (title, description, Open Graph type,
	    charset and, for URLs, the URL parts) is stored in ``self.metadata`` and
	    is reset at the start of every ``parse`` call.
	    """

	    # Upper bound, in seconds, for the HTTP fetch so that an unresponsive
	    # server cannot block the caller indefinitely.
	    _REQUEST_TIMEOUT_SECONDS = 30

	    # Inputs starting with one of these prefixes are fetched over HTTP;
	    # everything else is treated as a local file path.
	    _URL_PREFIXES = ("http://", "https://")

	    def __init__(self):
	        super().__init__()
	        # BeautifulSoup class, imported lazily by _load_bs4().
	        self._bs4 = None
	        # Metadata collected by the most recent parse() call.
	        self.metadata: Dict[str, Any] = {}

	    def _load_bs4(self):
	        """Import BeautifulSoup on first use and cache the class on the instance.

	        Raises:
	            ImportError: If the ``bs4`` package is not installed.
	        """
	        if self._bs4 is not None:
	            return
	        try:
	            from bs4 import BeautifulSoup
	        except ImportError as err:
	            # Chain the original error so the underlying import failure stays visible.
	            raise ImportError(
	                "BeautifulSoup is required for HTML parsing. Install it with: pip install beautifulsoup4"
	            ) from err
	        self._bs4 = BeautifulSoup

	    def parse(self, file_path: str) -> str:
	        """Parse an HTML file or URL into plain text and collect metadata.

	        Args:
	            file_path: Path to a local HTML file, or an ``http://`` /
	                ``https://`` URL to fetch.

	        Returns:
	            The extracted text with blank chunks removed and one chunk per
	            line. Metadata is not returned; it is stored in ``self.metadata``.

	        Raises:
	            ImportError: If BeautifulSoup is not installed.
	            requests.RequestException: If fetching a URL fails, times out, or
	                the server answers with an error status.
	            OSError: If a local file cannot be opened.
	            UnicodeDecodeError: If a local file is not valid UTF-8.
	        """
	        self._load_bs4()
	        self.metadata = {}

	        is_url = file_path.startswith(self._URL_PREFIXES)
	        html_content = self._read_html(file_path, is_url)

	        soup = self._bs4(html_content, "html.parser")
	        # Metadata is collected before the tree is modified below.
	        self._collect_metadata(soup, file_path, is_url)

	        # Script and style bodies are not readable content; drop them before
	        # extracting text.
	        for tag in soup(["script", "style"]):
	            tag.extract()

	        return self._normalize_text(soup.get_text())

	    def _read_html(self, source: str, is_url: bool) -> str:
	        """Return the raw HTML for ``source``, fetched over HTTP or read from disk."""
	        if is_url:
	            response = requests.get(source, timeout=self._REQUEST_TIMEOUT_SECONDS)
	            response.raise_for_status()
	            return response.text
	        with open(source, "r", encoding="utf-8") as f:
	            return f.read()

	    def _collect_metadata(self, soup, file_path: str, is_url: bool) -> None:
	        """Fill ``self.metadata`` from the parsed document and, for URLs, the URL parts."""
	        title_tag = soup.title
	        # .string is None for an empty <title> or one with nested markup;
	        # leave the key absent in that case, like every other missing field.
	        if title_tag is not None and title_tag.string is not None:
	            self.metadata["title"] = title_tag.string

	        # (metadata key, attributes identifying the <meta> tag, attribute holding the value)
	        meta_lookups = (
	            ("description", {"name": "description"}, "content"),
	            ("type", {"property": "og:type"}, "content"),
	            ("charset", {"charset": True}, "charset"),
	        )
	        for key, attrs, value_attr in meta_lookups:
	            tag = soup.find("meta", attrs=attrs)
	            if tag is None:
	                continue
	            # .get() instead of indexing: a matching tag may lack the value
	            # attribute (e.g. <meta name="description">), which must not
	            # abort the whole parse with a KeyError.
	            value = tag.get(value_attr)
	            if value is not None:
	                self.metadata[key] = value

	        if is_url:
	            parsed_url = urlparse(file_path)
	            self.metadata["url"] = file_path
	            self.metadata["domain"] = parsed_url.netloc
	            self.metadata["path"] = parsed_url.path

	    @staticmethod
	    def _normalize_text(text: str) -> str:
	        """Strip each line, split multi-phrase lines, and drop blank chunks."""
	        lines = (line.strip() for line in text.splitlines())
	        # Phrases that shared a line are separated by runs of two spaces.
	        # Splitting on a single space would put every word on its own line.
	        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
	        return "\n".join(chunk for chunk in chunks if chunk)