import csv
import logging
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup


class GoodReadsScrapper:
    def __init__(self) -> None:
        self.BASE_URL = "https://www.goodreads.com/shelf/show/{}"
        self.GENRE_URL = "https://www.goodreads.com/genres/list?page={}"
        self.GOOD_READS_URL = "https://www.goodreads.com{}"
        self.CURRENT_INDEX = 0

    def scrape_genres(self, filename="genres.txt"):
        """Return the list of genre names, loading them from `filename` if it
        exists, otherwise scraping the paginated genre listing and caching it."""
        if os.path.exists(filename):
            with open(filename, "r") as file:
                genres = [line.strip() for line in file]
            print("Genres loaded from file.")
        else:
            page = 1
            genres = []
            while True:
                url = self.GENRE_URL.format(page)
                response = requests.get(url)
                if response.status_code != 200:
                    # Stop on a failed request instead of looping forever.
                    break
                soup = BeautifulSoup(response.text, "html.parser")
                container_div = soup.find("div", {"class": "leftContainer"})
                genre_divs = container_div.find_all("div", {"class": "left"})
                for genre_div in genre_divs:
                    genre = genre_div.find(
                        "a", {"class": "mediumText actionLinkLite"}
                    )
                    if genre:
                        genre_text = genre.text.strip()
                        genres.append(genre_text)
                        print("Scraped genre is:", genre_text)
                # Stop when the pagination has no "next page" link.
                next_page_link = soup.find("a", {"class": "next_page"})
                if next_page_link is None:
                    break
                page += 1
            with open(filename, "w") as file:
                for genre in genres:
                    file.write(genre + "\n")
            print("Genres saved to file.")
        return genres

    def scrape_book(self, genre: str, csv_index: int):
        """Scrape every book linked on a genre shelf page, skipping records
        already written to the CSV in a previous run."""
        response = requests.get(self.BASE_URL.format(genre))
        if response.status_code != 200:
            logging.error(
                f"Failed to fetch shelf for genre {genre!r}. "
                f"Status code: {response.status_code}"
            )
            return
        soup = BeautifulSoup(response.text, "html.parser")
        container_div = soup.find("div", {"class": "leftContainer"})
        book_divs = container_div.find_all("div", {"class": "elementList"})
        for book_div in book_divs:
            link = book_div.find("div", {"class": "left"}).find(
                "a", {"class": "leftAlignedImage"}
            )["href"]
            self.CURRENT_INDEX += 1
            # Resume support: skip books already present in the CSV.
            if self.CURRENT_INDEX <= csv_index:
                continue
            self.scrape_x_book(link, self.CURRENT_INDEX)

    def scrape_x_book(self, book_url: str, i: int):
        """Scrape a single book page and append one row to book_data.csv."""
        try:
            response = requests.get(self.GOOD_READS_URL.format(book_url))
            if response.status_code != 200:
                logging.error(
                    f"Failed to fetch {book_url}. "
                    f"Status code: {response.status_code}"
                )
                return
            soup = BeautifulSoup(response.text, "html.parser")
            main_content = soup.find("div", {"class": "BookPage__mainContent"})

            # Title
            title = (
                main_content.find("div", {"class": "BookPageTitleSection"})
                .find("div", {"class": "BookPageTitleSection__title"})
                .find("h1", {"class": "Text Text__title1"})
                .text.strip()
            )

            # Metadata
            metadata_section = main_content.find(
                "div", {"class": "BookPageMetadataSection"}
            )

            # Author
            author = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__contributor"}
                )
                .find("span", {"class": "ContributorLink__name"})
                .text.strip()
            )

            # Rating
            rating = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__ratingStats"}
                )
                .find("div", {"class": "RatingStatistics__column"})
                .find("div", {"class": "RatingStatistics__rating"})
                .text.strip()
            )

            # Description
            description = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__description"}
                )
                .find("div", {"class": "TruncatedContent"})
                .find("span", {"class": "Formatted"})
                .text.strip()
            )

            # Genre list
            genres_list = []
            genre_div = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__genres"}
                )
                .find("ul", {"class": "CollapsableList"})
                .find_all("span", {"class": "BookPageMetadataSection__genreButton"})
            )
            for genre in genre_div:
                g = genre.find("span", {"class": "Button__labelItem"}).text.strip()
                genres_list.append(g)

            # Reviews (the second ReviewsList on the page holds the review cards)
            reviews = []
            reviews_section = soup.find("div", {"class": "ReviewsSection"}).find_all(
                "div", {"class": "ReviewsList"}
            )
            articles = reviews_section[1].find_all("article", {"class": "ReviewCard"})
            for article in articles:
                review_text = (
                    article.find("section", {"class": "ReviewText"})
                    .find("span", {"class": "Formatted"})
                    .text.strip()
                )
                reviews.append(review_text)

            # Append one row per book; write the header only if the file is empty.
            csv_filename = "book_data.csv"
            with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
                fieldnames = [
                    "Id",
                    "Title",
                    "Author",
                    "Rating",
                    "Description",
                    "Genres",
                    "Reviews",
                ]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                if csvfile.tell() == 0:
                    writer.writeheader()
                writer.writerow(
                    {
                        "Id": i,
                        "Title": title,
                        "Author": author,
                        "Rating": rating,
                        "Description": description,
                        "Genres": ", ".join(genres_list),
                        "Reviews": "\n".join(reviews),
                    }
                )

            print(f"Processed book: {title}\nRecord: {i}")
        except Exception as e:
            # Log the error and continue to the next book so one bad page
            # does not stop the whole run.
            logging.error(f"Error processing book {i}: {e}")

    def get_last_processed_id(self, csv_filename="book_data.csv"):
        """Return the highest Id already written to the CSV, or 0 if none."""
        try:
            if os.path.exists(csv_filename):
                df = pd.read_csv(csv_filename)
                if not df.empty and "Id" in df.columns:
                    return df["Id"].max()
        except pd.errors.EmptyDataError:
            # The file exists but contains no data.
            return 0
        except Exception as e:
            print(f"Error reading CSV file: {e}")
        # No file, no rows, or no Id column: start from scratch.
        return 0

    def main(self):
        genres = self.scrape_genres()
        last_processed_id = self.get_last_processed_id()
        for genre in genres:
            self.scrape_book(genre, last_processed_id)


if __name__ == "__main__":
    scrapper = GoodReadsScrapper()
    scrapper.main()