Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / web_util.py

gdms

Montagem arquitetura

30944a6 3 months ago

raw

history blame contribute delete

3.45 kB

	from typing import Optional, Tuple
	from urllib.parse import urlsplit

	from bs4 import BeautifulSoup
	import markdownify
	import requests


	class Web_Util:
	    """Static helpers to download a web page, clean its HTML and convert it to Markdown."""

	    # Headers sent with every request issued by download_html.
	    # NOTE(review): the contact address in the User-Agent looks like a redaction
	    # placeholder; confirm the intended value.
	    HEADERS = {
	        'User-Agent': 'MyCoolSearchBot/1.0 ([email protected])'
	    }

	    # Presentation/identity attributes dropped by _limpar_html, in addition to
	    # every inline event handler (attributes whose name starts with "on").
	    _STRIPPED_ATTRS = frozenset({'style', 'class', 'id'})

	    @staticmethod
	    def is_wikipedia_url(url: str) -> bool:
	        """Return True when the host of ``url`` is wikipedia.org or one of its subdomains.

	        The check is made on the parsed host name rather than on the whole
	        string, so a URL that merely mentions "wikipedia.org" in its path,
	        query string or as part of another domain is rejected.

	        Args:
	            url: Absolute URL, or a scheme-less "host/path" string.

	        Returns:
	            True if the host is ``wikipedia.org`` or ends with ``.wikipedia.org``.
	        """
	        if not url:
	            return False

	        candidate = url.strip().lower()
	        # urlsplit only recognises the host when a scheme or a leading "//" is present.
	        if "://" not in candidate and not candidate.startswith("//"):
	            candidate = "//" + candidate

	        try:
	            host = urlsplit(candidate).hostname or ""
	        except ValueError:
	            # Malformed authority (e.g. unbalanced IPv6 brackets).
	            return False

	        host = host.rstrip(".")
	        return host == "wikipedia.org" or host.endswith(".wikipedia.org")

	    @staticmethod
	    def _limpar_html(html: str) -> Tuple[Optional[str], str]:
	        """Extract the page title and strip scripts, styles, images and inline attributes.

	        Args:
	            html: Raw HTML to clean.

	        Returns:
	            A ``(title, cleaned_html)`` tuple. ``title`` is the text of the first
	            non-empty ``<title>`` or, failing that, ``<h1>``; it is None when
	            neither yields text.
	        """
	        soup = BeautifulSoup(html, 'html.parser')

	        # Page title: try <title> first, then fall back to the first <h1>.
	        title = None
	        for tag_name in ('title', 'h1'):
	            heading = soup.find(tag_name)
	            text = heading.get_text(strip=True) if heading else ''
	            if text:
	                title = text
	                break

	        # Drop elements that carry no readable content.
	        for tag in soup(['script', 'style', 'img']):
	            tag.decompose()

	        # Drop attributes that apply inline CSS/JS or only serve styling hooks.
	        for tag in soup.find_all(True):
	            # Iterate over a copy: entries are deleted while scanning.
	            for attr in list(tag.attrs):
	                if attr in Web_Util._STRIPPED_ATTRS or attr.startswith('on'):
	                    del tag.attrs[attr]

	        return title, str(soup)

	    @staticmethod
	    def download_html(url: str) -> Optional[Tuple[Optional[str], str]]:
	        """Download ``url`` and return its title together with the cleaned HTML.

	        Args:
	            url: URL to download.

	        Returns:
	            A ``(title, cleaned_html)`` tuple, or None when the request or the
	            cleaning step fails (the error is printed).
	        """
	        print(f"Baixando e convertendo: {url}")
	        try:
	            response = requests.get(url, headers=Web_Util.HEADERS, timeout=20)
	            response.raise_for_status()  # Raises on HTTP error status codes.
	            # Use the encoding detected from the body, falling back to UTF-8.
	            response.encoding = response.apparent_encoding or 'utf-8'
	            return Web_Util._limpar_html(response.text)
	        except requests.exceptions.RequestException as e:
	            print(f"Erro ao acessar a URL (requestException) {url}: {e}")
	            return None
	        except Exception as e:
	            print(f"Erro ao acessar a URL (erro genérico) {url}: {e}")
	            return None

	    @staticmethod
	    def convert_html_to_markdown(title: Optional[str], html: str) -> Optional[str]:
	        """Convert ``html`` to Markdown, prefixed with ``title`` as a level-1 heading.

	        Args:
	            title: Page title; when empty or None no heading line is emitted.
	            html: HTML to convert.

	        Returns:
	            The Markdown text, or None when the conversion fails (the error is
	            printed).
	        """
	        try:
	            md_content = markdownify.markdownify(
	                html,
	                heading_style="ATX",
	                strip=['script', 'style'],
	                escape_underscores=False)
	        except Exception as e:
	            print(f"Erro ao converter HTML para Markdown: {e}")
	            return None

	        body = md_content.strip()
	        # Avoid emitting a "# None" heading when the page had no usable title.
	        return f"# {title}\n\n{body}" if title else body

	    @staticmethod
	    def download_html_and_convert_to_md(url: str) -> Optional[str]:
	        """Download ``url`` and return its content converted to Markdown.

	        Args:
	            url: URL to download.

	        Returns:
	            The Markdown text, or None when the download fails, the page is
	            empty, or the conversion fails.
	        """
	        result = Web_Util.download_html(url)
	        # download_html signals failure with None, which cannot be unpacked.
	        if result is None:
	            return None

	        title, html = result
	        if not html:
	            return None
	        return Web_Util.convert_html_to_markdown(title, html)