File size: 1,146 Bytes
6d73c15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import asyncio
import re

import aiohttp
from bs4 import BeautifulSoup
from loguru import logger

def extract_urls(text: str) -> list[str]:
    """Return every http(s) URL found in *text*, in order of appearance."""
    # A URL runs from its scheme up to (not including) the next whitespace.
    pattern = re.compile(r"https?:\/\/[^\s]+")
    return pattern.findall(text)


def parse_inner_text(html_string: str) -> str:
    """Return the visible text inside the ``div#bodyContent`` element.

    Falls back to an empty string (after logging a warning) when no such
    element is found in *html_string*.
    """
    body = BeautifulSoup(html_string, "lxml").find("div", id="bodyContent")
    if not body:
        logger.warning("Could not parse the HTML content")
        return ""
    return body.get_text()


async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    """Download *url* with *session* and return the extracted inner text."""
    # The context manager guarantees the response is released even on error.
    async with session.get(url) as response:
        body = await response.text()
    return parse_inner_text(body)


async def fetch_all(urls: list[str]) -> str:
    """Concurrently fetch all *urls* and join their extracted texts with spaces.

    Failed fetches are skipped from the result. Each failure is logged
    individually with its URL and the underlying exception, so problems
    can actually be diagnosed (the previous blanket warning dropped both).

    Args:
        urls: The URLs to download.

    Returns:
        The space-joined inner texts of every URL that fetched successfully;
        an empty string when every fetch failed or *urls* is empty.
    """
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[fetch(session, url) for url in urls], return_exceptions=True
        )
    texts: list[str] = []
    # gather() preserves input order, so results line up with urls.
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            logger.warning("Could not fetch {}: {!r}", url, result)
        else:
            texts.append(result)
    return " ".join(texts)