import asyncio
import re

import aiohttp
from bs4 import BeautifulSoup
from loguru import logger


def extract_urls(text: str) -> list[str]:
    """Return every http(s) URL found in the given text."""
    url_pattern = r"(?P<url>https?://[^\s]+)"
    return re.findall(url_pattern, text)


def parse_inner_text(html_string: str) -> str:
    """Return the plain text of the div with id "bodyContent", or "" if absent."""
    soup = BeautifulSoup(html_string, "lxml")
    if content := soup.find("div", id="bodyContent"):
        return content.get_text()
    logger.warning("Could not parse the HTML content")
    return ""


async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    """Download one page and return its parsed inner text."""
    async with session.get(url) as response:
        html_string = await response.text()
        return parse_inner_text(html_string)


async def fetch_all(urls: list[str]) -> str:
    """Fetch all URLs concurrently and join the successful page texts."""
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[fetch(session, url) for url in urls], return_exceptions=True
        )
        # With return_exceptions=True, gather() puts exceptions in the result
        # list instead of raising, so keep only the string results.
        success_results = [result for result in results if isinstance(result, str)]
        if len(results) != len(success_results):
            logger.warning("Some URLs could not be fetched")
        return " ".join(success_results)