Spaces:
Runtime error
Runtime error
from bs4 import BeautifulSoup | |
from loguru import logger | |
from llm_engineering.domain.documents import ArticleDocument | |
from .base import BaseSeleniumCrawler | |
class MediumCrawler(BaseSeleniumCrawler):
    """Selenium-based crawler that saves Medium articles as ``ArticleDocument`` records."""

    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        """Add Medium-specific command-line arguments to the browser options.

        Args:
            options: Browser options object exposing ``add_argument``.
        """
        # NOTE(review): presumably "Profile 2" is a browser profile that is
        # already logged in to Medium -- confirm against the deployment setup.
        options.add_argument(r"--profile-directory=Profile 2")

    def extract(self, link: str, **kwargs) -> None:
        """Scrape the article at ``link`` and save it, unless it is already stored.

        The page is loaded and scrolled with the Selenium driver, parsed with
        BeautifulSoup, and saved through ``self.model`` with ``platform="medium"``.

        Args:
            link: URL of the Medium article to scrape.
            **kwargs: Must contain ``user``, an object with ``id`` and
                ``full_name`` attributes, stored as the article's author.

        Raises:
            KeyError: If the article is not stored yet and ``user`` is missing
                from ``kwargs``.
        """
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Article already exists in the database: {link}")

            return

        logger.info(f"Starting scrapping Medium article: {link}")

        try:
            self.driver.get(link)
            self.scroll_page()
            page_source = self.driver.page_source
        finally:
            # Close the browser window even when loading or scrolling fails,
            # so a failed scrape does not leave the window open.
            self.driver.close()

        soup = BeautifulSoup(page_source, "html.parser")
        title = soup.find_all("h1", class_="pw-post-title")
        subtitle = soup.find_all("h2", class_="pw-subtitle-paragraph")

        # Title/Subtitle fall back to None when the heading is not on the page.
        data = {
            "Title": title[0].string if title else None,
            "Subtitle": subtitle[0].string if subtitle else None,
            "Content": soup.get_text(),
        }

        user = kwargs["user"]
        instance = self.model(
            platform="medium",
            content=data,
            link=link,
            author_id=user.id,
            author_full_name=user.full_name,
        )
        instance.save()

        logger.info(f"Successfully scraped and saved article: {link}")