Spaces:
Sleeping
Sleeping
| import string | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from newspaper import ( | |
| ArticleBinaryDataException, | |
| ArticleException, | |
| article, | |
| ) | |
| from src.application.config import MAX_URL_SIZE | |
class URLReader:
    """
    Extract content (title, text, images) from a given URL.

    Supports two extraction methods: newspaper4k and BeautifulSoup.
    URLs whose Content-Length exceeds MAX_URL_SIZE (or cannot be
    determined) are skipped entirely.
    """

    def __init__(self, url: str, newspaper: bool = True):
        """
        Initializes the URLReader object and immediately attempts extraction.

        Args:
            url: The URL to extract content from.
            newspaper: True to use newspaper4k, False to use BeautifulSoup.
        """
        self.url: str = url
        self.text: str | None = None        # Extracted text content
        self.title: str | None = None       # Extracted title
        self.images: list[str] | None = None  # List of image URLs
        self.top_image: str | None = None   # URL of the top image
        # NOTE(review): set True once the size check passes, before the
        # extraction itself runs — a failed download still leaves it True.
        self.is_extracted: bool = False
        # Assigned unconditionally so the attribute always exists, even
        # when the size check below bails out early.
        self.newspaper: bool = newspaper

        url_size = self.get_size()
        # Skip pages that are too large or whose size cannot be determined.
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True
        if self.newspaper is True:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """
        Extracts content from a URL using the newspaper4k library.

        Populates title, text, images (de-duplicated) and top_image.
        On failure an error is printed and the attributes stay None.
        """
        try:
            # article() downloads the page itself, so no separate
            # requests.get() pre-fetch is needed (the original fetched
            # the same URL twice).
            news = article(url=self.url, fetch_images=True)
            self.title = news.title
            self.text = news.text
            self.images = list(set(news.images))  # Remove duplicates
            self.top_image = news.top_image
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\tβββ Error downloading article: {e}")
            return None

    def extract_content_bs(self):
        """
        Extracts content from a URL using BeautifulSoup.

        Populates title, text, images and top_image.  Image, caption,
        table, script and style elements are stripped before the text
        of all <p> tags is joined into self.text.
        """
        try:
            # Timeout matches get_size() so a dead server cannot hang us.
            response = requests.get(self.url, timeout=5)
            response.raise_for_status()
            response.encoding = response.apparent_encoding  # Detect encoding
            soup = BeautifulSoup(response.content, "html.parser")
            self.title = soup.title.string if soup.title else None
            # Guard against <img> tags without a src attribute (KeyError
            # in the original) — skip them instead.
            image_urls = [
                img["src"] for img in soup.find_all("img") if img.get("src")
            ]
            self.images = image_urls
            # Pages with no images previously raised IndexError here,
            # which the broad except turned into a total extraction failure.
            self.top_image = image_urls[0] if image_urls else None
            # Remove unwanted elements from the HTML
            for element in soup(
                ["img", "figcaption", "table", "script", "style"],
            ):
                element.extract()
            paragraphs = soup.find_all("p")
            self.text = " ".join([p.get_text() for p in paragraphs])
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return None

    def get_size(self):
        """
        Retrieves the size of a URL's content using a HEAD request.

        Returns:
            The Content-Length in bytes, or None when the header is
            missing or the request fails.
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses
            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            else:
                print("\t\tβββ Content-Length header not found")
                return None
        except requests.exceptions.RequestException as e:
            print(f"\t\tβββ Error getting URL size: {e}")
            return None
| if __name__ == "__main__": | |
| url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o" | |
| reader = URLReader(url) | |
| print(f"Title: {reader.title}") | |
| print(f"Text: {reader.text}") | |