Spaces:

mikeee
/

mlbee

Running

mlbee / st_mlbee /url2txt.py

freemt

Change pypi-name mv mlbee st_mlbee

6bf2aa9 about 3 years ago

2.38 kB

	"""Fetch text from url."""
	from typing import Optional
	from urllib.parse import urlparse

	import html2text
	import httpx
	import streamlit as st
	from logzero import logger
	from readability import Document


	# @st.cache
	def url2txt(
	url: str,
	bodywidth: Optional[int] = 5000,
	remove: bool = False,
	show_url: bool = True,
	ignore_links: bool = True,
	) -> str:
	"""Fetch text from url.

	Args:
	url: netloc from which to fetch text
	bodywidth: if set to None, fall back to default bodywidth of
	html2text.HTML2Text
	remove: remove blank lines if set to True
	show_url: prepend url if set to True
	ignore_links: remove [ur](url)

	Return:
	main body in text

	bodywidth: Optional[int] = 5000
	remove: bool = False
	show_url: bool = True
	ignore_links: bool = True
	"""
	url = url.strip()
	if not url.startswith("http"):
	url = "http://" + url

	logger.info("url: %s", url)

	parsed = urlparse(url)
	if not parsed.scheme or not parsed.netloc: # no scheme or netloc present
	raise Exception(f"Invalid url: {url}")

	try:
	resp = httpx.get(url, timeout=30)
	resp.raise_for_status()
	except Exception as exc:
	logger.error(exc)
	raise

	try:
	content_type = resp.headers["content-type"]
	except Exception as e:
	logger.error(e)
	content_type = ""
	# output text if text/plain
	if "text/plain" in content_type:
	return resp.text

	# handle html and the rest
	try:
	doc = Document(resp.text)
	except Exception as exc:
	logger.error(exc)
	raise

	if not doc.summary().strip():
	raise Exception("No content for some reason...")

	if bodywidth is not None:
	handle = html2text.HTML2Text(bodywidth=bodywidth)
	else:
	handle = html2text.HTML2Text()

	handle.ignore_links = ignore_links

	try:
	res = handle.handle(doc.summary())
	except Exception as exc:
	logger.error(exc)
	raise

	# remove double blank lines
	if remove:
	res = "\n".join(elm for elm in res.splitlines() if elm.strip())

	if not res.strip(): # warn if empty output
	logger.warning("Output seems to be empty...")

	if show_url:
	return f"{url}\n# {doc.title()}\n{res}"

	return f"# {doc.title()}\n{res}"