purpleriann's picture
Upload folder using huggingface_hub
a22e84b verified
from bs4 import BeautifulSoup
from loguru import logger
from llm_engineering.domain.documents import ArticleDocument
from .base import BaseSeleniumCrawler
class MediumCrawler(BaseSeleniumCrawler):
    """Crawler that scrapes a Medium article and stores it as an ``ArticleDocument``."""

    # Document class used both for the duplicate lookup and for persistence.
    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        """Add the ``--profile-directory=Profile 2`` argument to *options*.

        NOTE(review): presumably invoked by ``BaseSeleniumCrawler`` while it
        builds the web driver, to reuse an existing browser profile -- confirm
        against the base class.
        """
        options.add_argument(r"--profile-directory=Profile 2")

    @staticmethod
    def _first_text(tags):
        """Return the text of the first tag in *tags*, or ``None`` if unavailable.

        ``Tag.string`` is ``None`` when the tag has more than one child node
        (for example a heading containing ``<em>``), so fall back to
        ``get_text()`` in that case. An empty heading still yields ``None``.
        """
        if not tags:
            return None

        first = tags[0]
        text = first.string
        if text is None:
            text = first.get_text() or None
        return text

    def extract(self, link: str, **kwargs) -> None:
        """Scrape the article at *link* and save it, unless it is already stored.

        Args:
            link: URL of the Medium article.
            **kwargs: Must contain ``user``, an object exposing ``id`` and
                ``full_name``; these are recorded as the article's author.

        Raises:
            KeyError: If ``user`` is missing from *kwargs* (only when the
                article is not already stored).
        """
        if self.model.find(link=link) is not None:
            logger.info(f"Article already exists in the database: {link}")
            return

        logger.info(f"Starting scrapping Medium article: {link}")

        try:
            # Fail before navigating if the author is missing; the ``finally``
            # below still closes the driver, as the original did on this path.
            user = kwargs["user"]

            self.driver.get(link)
            self.scroll_page()
            page_source = self.driver.page_source
        finally:
            # Always release the browser window, even if loading or scrolling
            # the page raised.
            self.driver.close()

        soup = BeautifulSoup(page_source, "html.parser")
        data = {
            "Title": self._first_text(soup.find_all("h1", class_="pw-post-title")),
            "Subtitle": self._first_text(
                soup.find_all("h2", class_="pw-subtitle-paragraph")
            ),
            "Content": soup.get_text(),
        }

        instance = self.model(
            platform="medium",
            content=data,
            link=link,
            author_id=user.id,
            author_full_name=user.full_name,
        )
        instance.save()

        logger.info(f"Successfully scraped and saved article: {link}")