genai_service / scraper.py
ahmed-eisa's picture
fixed scraper file name
eaeb210
import asyncio
import re
import aiohttp
from bs4 import BeautifulSoup
from loguru import logger
def extract_urls(text: str) -> list[str]:
url_pattern = r"(?P<url>https?:\/\/[^\s]+)"
urls = re.findall(url_pattern, text)
return urls
def parse_inner_text(html_string: str) -> str:
soup = BeautifulSoup(html_string, "lxml")
if content := soup.find("div", id="bodyContent"):
return content.get_text()
logger.warning("Could not parse the HTML content")
return ""
async def fetch(session: aiohttp.ClientSession, url: str) -> str:
async with session.get(url) as response:
html_string = await response.text()
return parse_inner_text(html_string)
async def fetch_all(urls: list[str]) -> str:
async with aiohttp.ClientSession() as session:
results = await asyncio.gather(
*[fetch(session, url) for url in urls], return_exceptions=True
)
success_results = [result for result in results if isinstance(result, str)]
if len(results) != len(success_results):
logger.warning("Some URLs could not be fetched")
return " ".join(success_results)