import asyncio
import re

import aiohttp
from bs4 import BeautifulSoup
from loguru import logger


def extract_urls(text: str) -> list[str]:
    # Match http/https URLs up to the next whitespace character.
    # The named group below was likely stripped during extraction
    # (an angle-bracketed <url> reads as an HTML tag); without it,
    # "(?P" is a regex syntax error.
    url_pattern = r"(?P<url>https?://[^\s]+)"
    urls = re.findall(url_pattern, text)
    return urls


def parse_inner_text(html_string: str) -> str:
    soup = BeautifulSoup(html_string, "lxml")
    # "bodyContent" is the id of the article-body div on Wikipedia pages.
    if content := soup.find("div", id="bodyContent"):
        return content.get_text()
    logger.warning("Could not parse the HTML content")
    return ""


async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    async with session.get(url) as response:
        html_string = await response.text()
        return parse_inner_text(html_string)


async def fetch_all(urls: list[str]) -> str:
    async with aiohttp.ClientSession() as session:
        # return_exceptions=True collects failures as exception objects
        # instead of cancelling the remaining requests.
        results = await asyncio.gather(
            *[fetch(session, url) for url in urls], return_exceptions=True
        )
        success_results = [result for result in results if isinstance(result, str)]
        if len(results) != len(success_results):
            logger.warning("Some URLs could not be fetched")
        return " ".join(success_results)
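
# A minimal usage sketch, not part of the original snippet: extract URLs
# from a piece of text, fetch their article bodies concurrently, and join
# the results. The sample text and URLs are hypothetical placeholders.
if __name__ == "__main__":
    sample_text = (
        "See https://en.wikipedia.org/wiki/Python_(programming_language) "
        "and https://en.wikipedia.org/wiki/Concurrency_(computer_science)."
    )
    found_urls = extract_urls(sample_text)
    combined_text = asyncio.run(fetch_all(found_urls))
    logger.info("Fetched {} characters of text", len(combined_text))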