# File size: 7,757 Bytes

# 4aa3246

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import logging
import csv


class GoodReadsScrapper:
    """Scrape Goodreads genre shelves and append book details to a CSV file.

    The workflow driven by :meth:`main` is:

    1. load the genre list (from a cache file, or by scraping the genre
       listing pages),
    2. read the highest ``Id`` already stored in the CSV file,
    3. walk every genre shelf, numbering books in the order they are
       encountered, and scrape each book whose number is greater than the
       stored ``Id``.
    """

    # Seconds to wait for a single HTTP response before giving up; without
    # a timeout a stalled connection would block the whole run.
    REQUEST_TIMEOUT = 30

    # CSV file that scraped books are appended to.
    CSV_FILENAME = "book_data.csv"

    # Column order of the CSV file.
    CSV_FIELDNAMES = [
        "Id",
        "Title",
        "Author",
        "Rating",
        "Description",
        "Genres",
        "Reviews",
    ]

    def __init__(self) -> None:
        self.BASE_URL = "https://www.goodreads.com/shelf/show/{}"
        self.GENRE_URL = "https://www.goodreads.com/genres/list?page={}"
        self.GOOD_READS_URL = "https://www.goodreads.com{}"
        # Running count of book links seen so far across all shelves.
        self.CURRENT_INDEX = 0

    def _fetch_soup(self, url: str):
        """Download *url* and return its parsed HTML.

        Returns ``None`` (after logging the reason) when the request raises
        or the response status is not 200.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            logging.error("Failed to fetch %s: %s", url, exc)
            return None

        if response.status_code != 200:
            logging.error(
                "Failed to fetch %s. Status code: %s", url, response.status_code
            )
            return None

        return BeautifulSoup(response.text, "html.parser")

    def scrape_genres(self, filename="genres.txt"):
        """Return the list of genre names.

        If *filename* exists, genres are read from it (one per line, blank
        lines ignored). Otherwise the genre listing pages are scraped page by
        page until a page has no "next page" link, and the result is written
        to *filename*.

        Pagination stops at the first page that cannot be fetched or parsed.
        In that case the genres collected so far are returned but not
        written to *filename*, so a later run scrapes the list again instead
        of reusing a partial one.
        """
        if os.path.exists(filename):
            with open(filename, "r", encoding="utf-8") as file:
                stripped_lines = (line.strip() for line in file)
                genres = [name for name in stripped_lines if name]
            print("Genres loaded from file.")
            return genres

        genres = []
        page = 1
        complete = False

        while True:
            soup = self._fetch_soup(self.GENRE_URL.format(page))
            if soup is None:
                # Retrying the next page after a failure could loop forever.
                break

            container_div = soup.find("div", {"class": "leftContainer"})
            if container_div is None:
                logging.error("No genre container found on genre page %s", page)
                break

            for genre_div in container_div.find_all("div", {"class": "left"}):
                genre = genre_div.find("a", {"class": "mediumText actionLinkLite"})
                if genre is None:
                    continue
                genre_text = genre.text.strip()
                if genre_text:
                    genres.append(genre_text)
                    print("Scrapped genre is:", genre_text)

            if soup.find("a", {"class": "next_page"}) is None:
                complete = True
                break

            page += 1

        if complete and genres:
            with open(filename, "w", encoding="utf-8") as file:
                file.writelines(f"{genre}\n" for genre in genres)
            print("Genres saved to file.")
        else:
            logging.warning(
                "Genre list is empty or incomplete; not saving it to %s", filename
            )

        return genres

    def scrape_book(self, genre: str, csv_index: int):
        """Scrape every not-yet-processed book listed on the shelf of *genre*.

        Each book link found increments ``self.CURRENT_INDEX``; books whose
        index is less than or equal to *csv_index* are skipped, the others
        are passed to :meth:`scrape_x_book`. List entries without a book link
        are ignored and do not consume an index. Nothing is done when the
        shelf page cannot be fetched or has no book container.
        """
        soup = self._fetch_soup(self.BASE_URL.format(genre))
        if soup is None:
            return

        container_div = soup.find("div", {"class": "leftContainer"})
        if container_div is None:
            logging.error("No book list found on the shelf page for %s", genre)
            return

        for book_div in container_div.find_all("div", {"class": "elementList"}):
            left_div = book_div.find("div", {"class": "left"})
            anchor = (
                left_div.find("a", {"class": "leftAlignedImage"})
                if left_div is not None
                else None
            )
            link = anchor.get("href") if anchor is not None else None
            if not link:
                continue

            self.CURRENT_INDEX += 1
            if self.CURRENT_INDEX <= csv_index:
                continue

            self.scrape_x_book(link, self.CURRENT_INDEX)

    @staticmethod
    def _extract_reviews(soup):
        """Return the review texts found on a book page (possibly empty)."""
        section = soup.find("div", {"class": "ReviewsSection"})
        if section is None:
            return []

        review_lists = section.find_all("div", {"class": "ReviewsList"})
        # The reviews are read from the second "ReviewsList" block.
        # NOTE(review): presumably the first block holds something other than
        # the community reviews -- confirm against the live page markup.
        if len(review_lists) < 2:
            return []

        reviews = []
        for article in review_lists[1].find_all("article", {"class": "ReviewCard"}):
            text_section = article.find("section", {"class": "ReviewText"})
            formatted = (
                text_section.find("span", {"class": "Formatted"})
                if text_section is not None
                else None
            )
            if formatted is not None:
                reviews.append(formatted.text.strip())
        return reviews

    def _parse_book_page(self, soup) -> dict:
        """Extract the CSV fields (all except ``Id``) from a book page.

        Raises ``AttributeError`` when a mandatory element (title, author,
        rating, description or genre list) is missing from the page.
        """
        main_content = soup.find("div", {"class": "BookPage__mainContent"})

        title = (
            main_content.find("div", {"class": "BookPageTitleSection"})
            .find("div", {"class": "BookPageTitleSection__title"})
            .find("h1", {"class": "Text Text__title1"})
            .text.strip()
        )

        metadata_section = main_content.find(
            "div", {"class": "BookPageMetadataSection"}
        )

        author = (
            metadata_section.find(
                "div", {"class": "BookPageMetadataSection__contributor"}
            )
            .find("span", {"class": "ContributorLink__name"})
            .text.strip()
        )

        rating = (
            metadata_section.find(
                "div", {"class": "BookPageMetadataSection__ratingStats"}
            )
            .find("div", {"class": "RatingStatistics__column"})
            .find("div", {"class": "RatingStatistics__rating"})
            .text.strip()
        )

        description = (
            metadata_section.find(
                "div", {"class": "BookPageMetadataSection__description"}
            )
            .find("div", {"class": "TruncatedContent"})
            .find("span", {"class": "Formatted"})
            .text.strip()
        )

        genre_buttons = (
            metadata_section.find(
                "div", {"class": "BookPageMetadataSection__genres"}
            )
            .find("ul", {"class": "CollapsableList"})
            .find_all("span", {"class": "BookPageMetadataSection__genreButton"})
        )
        genres_list = [
            button.find("span", {"class": "Button__labelItem"}).text.strip()
            for button in genre_buttons
        ]

        return {
            "Title": title,
            "Author": author,
            "Rating": rating,
            "Description": description,
            "Genres": ", ".join(genres_list),
            "Reviews": "\n".join(self._extract_reviews(soup)),
        }

    def _append_row(self, row: dict) -> None:
        """Append *row* to the CSV file, writing the header if it is empty."""
        with open(self.CSV_FILENAME, "a", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.CSV_FIELDNAMES)

            # In append mode the position starts at the end of the file, so
            # position 0 means the file has no content yet.
            if csvfile.tell() == 0:
                writer.writeheader()

            writer.writerow(row)

    def scrape_x_book(self, book_url: str, i: int):
        """Scrape one book page and append it to the CSV file with ``Id`` *i*.

        *book_url* is the site-relative path of the book page. Any failure
        (download error, unexpected page layout, write error) is logged and
        swallowed so that one bad page does not stop the whole run.
        """
        try:
            soup = self._fetch_soup(self.GOOD_READS_URL.format(book_url))
            if soup is None:
                return

            row = self._parse_book_page(soup)
            row["Id"] = i
            self._append_row(row)

            title = row["Title"]
            print(f"Processed book: {title} \nRecord: {i}")

        except Exception as e:
            # Deliberate best-effort: log and continue with the next book.
            logging.error("Error processing book %s: %s", i, e)

    def get_last_processed_id(self, csv_filename="book_data.csv"):
        """Return the highest ``Id`` stored in *csv_filename* as an ``int``.

        Returns 0 when the file is missing, empty, unreadable, has no ``Id``
        column, or contains no numeric ``Id`` value.
        """
        if not os.path.exists(csv_filename):
            return 0

        try:
            df = pd.read_csv(csv_filename)
        except pd.errors.EmptyDataError:
            # The file exists but has no content at all.
            return 0
        except Exception as e:
            logging.error("Error reading CSV file: %s", e)
            return 0

        if df.empty or "Id" not in df.columns:
            return 0

        # Coerce so that blank or malformed ids are ignored instead of
        # turning the result into NaN or raising.
        last_id = pd.to_numeric(df["Id"], errors="coerce").max()
        if pd.isna(last_id):
            return 0
        return int(last_id)

    def main(self):
        """Run a full scrape, resuming after the last book stored in the CSV."""
        # Restart the numbering so that calling main() again on the same
        # instance skips already-stored books instead of duplicating them.
        self.CURRENT_INDEX = 0

        genres = self.scrape_genres()
        last_processed_id = self.get_last_processed_id(self.CSV_FILENAME)
        for genre in genres:
            self.scrape_book(genre, last_processed_id)


if __name__ == "__main__":
    # Start scraping only when this file is executed as a script, so that
    # importing the module does not trigger network requests and file writes.
    scrapper = GoodReadsScrapper()
    scrapper.main()