Spaces:
Sleeping
Sleeping
| import string | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from newspaper import ( | |
| ArticleBinaryDataException, | |
| ArticleException, | |
| article, | |
| ) | |
| from src.application.config import MAX_URL_SIZE | |
class URLReader:
    """
    Extract content (title, text, images) from a given URL.

    Supports two extraction methods: newspaper4k and BeautifulSoup.
    URLs whose Content-Length exceeds MAX_URL_SIZE (or cannot be
    determined) are skipped entirely.
    """

    def __init__(self, url: str, newspaper: bool = True):
        """
        Initializes the URLReader object and immediately attempts extraction.

        Args:
            url: The URL to extract content from.
            newspaper: True to use newspaper4k, False to use BeautifulSoup.
        """
        self.url: str = url
        self.text: str | None = None        # Extracted text content
        self.title: str | None = None       # Extracted title
        self.images: list[str] | None = None  # List of image URLs
        self.top_image: str | None = None   # URL of the top image
        # NOTE(review): set True once the size check passes, before the
        # extraction itself runs — a failed download still leaves it True.
        self.is_extracted: bool = False
        # Assigned unconditionally so the attribute always exists, even
        # when the size check below bails out early.
        self.newspaper: bool = newspaper

        url_size = self.get_size()
        # Skip pages that are too large or whose size cannot be determined.
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True
        if self.newspaper is True:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """
        Extracts content from a URL using the newspaper4k library.

        Populates title, text, images (de-duplicated) and top_image.
        On failure an error is printed and the attributes stay None.
        """
        try:
            # article() downloads the page itself, so no separate
            # requests.get() pre-fetch is needed (the original fetched
            # the same URL twice).
            news = article(url=self.url, fetch_images=True)
            self.title = news.title
            self.text = news.text
            self.images = list(set(news.images))  # Remove duplicates
            self.top_image = news.top_image
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\tβββ Error downloading article: {e}")
            return None

    def extract_content_bs(self):
        """
        Extracts content from a URL using BeautifulSoup.

        Populates title, text, images and top_image.  Image, caption,
        table, script and style elements are stripped before the text
        of all <p> tags is joined into self.text.
        """
        try:
            # Timeout matches get_size() so a dead server cannot hang us.
            response = requests.get(self.url, timeout=5)
            response.raise_for_status()
            response.encoding = response.apparent_encoding  # Detect encoding
            soup = BeautifulSoup(response.content, "html.parser")
            self.title = soup.title.string if soup.title else None
            # Guard against <img> tags without a src attribute (KeyError
            # in the original) — skip them instead.
            image_urls = [
                img["src"] for img in soup.find_all("img") if img.get("src")
            ]
            self.images = image_urls
            # Pages with no images previously raised IndexError here,
            # which the broad except turned into a total extraction failure.
            self.top_image = image_urls[0] if image_urls else None
            # Remove unwanted elements from the HTML
            for element in soup(
                ["img", "figcaption", "table", "script", "style"],
            ):
                element.extract()
            paragraphs = soup.find_all("p")
            self.text = " ".join([p.get_text() for p in paragraphs])
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return None

    def get_size(self):
        """
        Retrieves the size of a URL's content using a HEAD request.

        Returns:
            The Content-Length in bytes, or None when the header is
            missing or the request fails.
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses
            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            else:
                print("\t\tβββ Content-Length header not found")
                return None
        except requests.exceptions.RequestException as e:
            print(f"\t\tβββ Error getting URL size: {e}")
            return None
| if __name__ == "__main__": | |
| url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o" | |
| reader = URLReader(url) | |
| print(f"Title: {reader.title}") | |
| print(f"Text: {reader.text}") | |