Spaces:

ahmed-eisa
/

genai_service

Sleeping

genai_service / scraper.py

fixed scraper file name

eaeb210 10 days ago

1.15 kB

	import asyncio
	import re

	import aiohttp
	from bs4 import BeautifulSoup
	from loguru import logger

	def extract_urls(text: str) -> list[str]:
	    """Return every http(s) URL found in *text*, in order of appearance.

	    A URL candidate is ``http://`` or ``https://`` followed by a run of
	    non-whitespace characters. Because URLs are usually embedded in prose,
	    each candidate is cleaned up before being returned:

	    * trailing sentence punctuation and quotes (``. , ; : ! ? " '``) are
	      removed, so ``"see https://example.com."`` yields
	      ``https://example.com``;
	    * a trailing closing bracket is removed only when it has no matching
	      opener inside the URL, so ``.../Python_(programming_language)`` keeps
	      its parenthesis while ``(https://example.com)`` loses the stray ``)``;
	    * candidates with nothing left after the scheme are discarded.

	    Args:
	        text: Arbitrary text that may contain URLs.

	    Returns:
	        The cleaned URLs; an empty list when none are found.
	    """
	    trailing_punctuation = ".,;:!?\"'"
	    openers = {")": "(", "]": "[", "}": "{", ">": "<"}

	    urls = []
	    for url in re.findall(r"https?://\S+", text):
	        while url:
	            last = url[-1]
	            if last in trailing_punctuation:
	                url = url[:-1]
	            elif last in openers and url.count(last) > url.count(openers[last]):
	                # Unbalanced closer: it belongs to the surrounding prose.
	                url = url[:-1]
	            else:
	                break
	        # Skip residue such as "https://" left over from "https://...".
	        if url.partition("://")[2]:
	            urls.append(url)
	    return urls


	def parse_inner_text(html_string: str) -> str:
	    """Extract the text of the ``<div id="bodyContent">`` element.

	    The markup is parsed with BeautifulSoup's ``lxml`` parser. If the
	    document has no ``div`` with id ``bodyContent``, a warning is logged
	    and an empty string is returned.

	    Args:
	        html_string: Raw HTML of a page.

	    Returns:
	        The text content of the ``bodyContent`` div, or ``""`` when the
	        div is absent.
	    """
	    document = BeautifulSoup(html_string, "lxml")
	    body_div = document.find("div", id="bodyContent")
	    if not body_div:
	        logger.warning("Could not parse the HTML content")
	        return ""
	    return body_div.get_text()


	async def fetch(session: aiohttp.ClientSession, url: str) -> str:
	    """Download *url* with *session* and return its parsed inner text.

	    The response body is read as text and handed to ``parse_inner_text``.
	    Exceptions raised by the request or by reading the body are not caught
	    here; they propagate to the caller.

	    Args:
	        session: An open aiohttp client session used to issue the GET.
	        url: Address of the page to download.

	    Returns:
	        Whatever ``parse_inner_text`` returns for the downloaded HTML.
	    """
	    async with session.get(url) as page:
	        return parse_inner_text(await page.text())


	async def fetch_all(urls: list[str]) -> str:
	    """Fetch all *urls* concurrently and join their parsed text.

	    Every URL is fetched through one shared ``aiohttp.ClientSession``.
	    A failed fetch does not abort the others: its exception is collected
	    by ``asyncio.gather(..., return_exceptions=True)``, logged together
	    with the URL it belongs to, and left out of the result.

	    Args:
	        urls: Addresses to fetch. May be empty.

	    Returns:
	        The text of every successfully fetched page, joined with single
	        spaces in the order of *urls*; ``""`` when *urls* is empty or
	        every fetch failed.
	    """
	    # Materialise once so results can be paired with their URLs below.
	    targets = list(urls)
	    if not targets:
	        # Nothing to do: avoid opening a client session for no requests.
	        return ""

	    async with aiohttp.ClientSession() as session:
	        results = await asyncio.gather(
	            *(fetch(session, url) for url in targets), return_exceptions=True
	        )

	    success_results = []
	    for url, result in zip(targets, results):
	        if isinstance(result, str):
	            success_results.append(result)
	        else:
	            # gather() returned the exception instead of raising it;
	            # record which URL failed and why.
	            logger.warning("Could not fetch {}: {!r}", url, result)

	    if len(results) != len(success_results):
	        logger.warning("Some URLs could not be fetched")
	    return " ".join(success_results)