|
import csv
import logging
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
|
class GoodReadsScraper:
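    """Scrape Goodreads genre names, then book metadata and reviews for each
    genre, appending one row per book to a CSV file."""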
|
    def __init__(self) -> None:
        self.BASE_URL = "https://www.goodreads.com/shelf/show/{}"
        self.GENRE_URL = "https://www.goodreads.com/genres/list?page={}"
        self.GOOD_READS_URL = "https://www.goodreads.com{}"
        # Running row counter; used to skip records already written to the CSV.
        self.CURRENT_INDEX = 0
|
def scrape_genres(self, filename="genres.txt"): |
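        """Return all genre names, reading from `filename` when it exists and
        otherwise scraping the paginated Goodreads genre index and caching it."""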
|
        if os.path.exists(filename):
            with open(filename, "r") as file:
                genres = [line.strip() for line in file]
            print("Genres loaded from file.")
        else:
            page = 1
            genres = []

            while True:
                url = self.GENRE_URL.format(page)
                response = requests.get(url, timeout=10)

                if response.status_code != 200:
                    logging.error(
                        f"Failed to fetch {url}. Status code: {response.status_code}"
                    )
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                container_div = soup.find("div", {"class": "leftContainer"})
                genre_divs = container_div.find_all("div", {"class": "left"})
                for genre_div in genre_divs:
                    genre = genre_div.find(
                        "a", {"class": "mediumText actionLinkLite"}
                    )
                    if genre:
                        genre_text = genre.text.strip()
                        genres.append(genre_text)
                        print("Scraped genre:", genre_text)

                # Stop when the pagination widget has no "next page" link.
                next_page_link = soup.find("a", {"class": "next_page"})
                if next_page_link is None:
                    break

                page += 1

            with open(filename, "w") as file:
                for genre in genres:
                    file.write(genre + "\n")
            print("Genres saved to file.")

        return genres
|
def scrape_book(self, genre: str, csv_index: int): |
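        """Scrape the shelf page for `genre`, visiting each listed book while
        skipping records at or below `csv_index` (already written to the CSV)."""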
|
        response = requests.get(self.BASE_URL.format(genre), timeout=10)

        if response.status_code != 200:
            logging.error(
                f"Failed to fetch shelf for {genre}. "
                f"Status code: {response.status_code}"
            )
            return

        soup = BeautifulSoup(response.text, "html.parser")
        container_div = soup.find("div", {"class": "leftContainer"})
        book_divs = container_div.find_all("div", {"class": "elementList"})

        for book_div in book_divs:
            link = book_div.find("div", {"class": "left"}).find(
                "a", {"class": "leftAlignedImage"}
            )["href"]
            self.CURRENT_INDEX += 1

            # Resume support: skip books written to the CSV on a previous run.
            if self.CURRENT_INDEX <= csv_index:
                continue

            self.scrape_x_book(link, self.CURRENT_INDEX)
|
def scrape_x_book(self, book_url: str, i: int): |
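        """Fetch a single book page and append its title, author, rating,
        description, genres, and visible reviews as row `i` of the CSV."""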
|
        try:
            response = requests.get(
                self.GOOD_READS_URL.format(book_url), timeout=10
            )

            if response.status_code != 200:
                logging.error(
                    f"Failed to fetch {book_url}. Status code: {response.status_code}"
                )
                return

            soup = BeautifulSoup(response.text, "html.parser")

            # The chained find() calls below raise AttributeError if Goodreads
            # changes its markup; the except clause at the end logs and skips
            # the book.
            main_content = soup.find("div", {"class": "BookPage__mainContent"})

            title = (
                main_content.find("div", {"class": "BookPageTitleSection"})
                .find("div", {"class": "BookPageTitleSection__title"})
                .find("h1", {"class": "Text Text__title1"})
                .text.strip()
            )

            metadata_section = main_content.find(
                "div", {"class": "BookPageMetadataSection"}
            )

            author = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__contributor"}
                )
                .find("span", {"class": "ContributorLink__name"})
                .text.strip()
            )

            rating = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__ratingStats"}
                )
                .find("div", {"class": "RatingStatistics__column"})
                .find("div", {"class": "RatingStatistics__rating"})
                .text.strip()
            )

            description = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__description"}
                )
                .find("div", {"class": "TruncatedContent"})
                .find("span", {"class": "Formatted"})
                .text.strip()
            )

            genres_list = []
            genre_spans = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__genres"}
                )
                .find("ul", {"class": "CollapsableList"})
                .find_all("span", {"class": "BookPageMetadataSection__genreButton"})
            )

            for genre in genre_spans:
                g = genre.find("span", {"class": "Button__labelItem"}).text.strip()
                genres_list.append(g)
|
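            # Only reviews present in the initial, server-rendered HTML are
            # captured; requests does not execute the page's JavaScript.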
            reviews = []
            reviews_section = soup.find("div", {"class": "ReviewsSection"}).find_all(
                "div", {"class": "ReviewsList"}
            )
            # The page contains several ReviewsList containers; the second one
            # holds the visible ReviewCard articles.
            articles = reviews_section[1].find_all("article", {"class": "ReviewCard"})

            for article in articles:
                review_text = (
                    article.find("section", {"class": "ReviewText"})
                    .find("span", {"class": "Formatted"})
                    .text.strip()
                )
                reviews.append(review_text)
|
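            # Append one row per book; the header is written only when the
            # file is empty so that interrupted runs can resume cleanly.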
            csv_filename = "book_data.csv"
            with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
                fieldnames = [
                    "Id",
                    "Title",
                    "Author",
                    "Rating",
                    "Description",
                    "Genres",
                    "Reviews",
                ]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                if csvfile.tell() == 0:
                    writer.writeheader()

                writer.writerow(
                    {
                        "Id": i,
                        "Title": title,
                        "Author": author,
                        "Rating": rating,
                        "Description": description,
                        "Genres": ", ".join(genres_list),
                        "Reviews": "\n".join(reviews),
                    }
                )

            print(f"Processed book: {title}\nRecord: {i}")

        except Exception as e:
            logging.error(f"Error processing book {i}: {e}")
|
def get_last_processed_id(self, csv_filename="book_data.csv"): |
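        """Return the highest Id already written to the CSV, or 0 when the
        file is missing, empty, or unreadable."""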
|
        try:
            if os.path.exists(csv_filename):
                df = pd.read_csv(csv_filename)
                if not df.empty and "Id" in df.columns:
                    return df["Id"].max()
        except pd.errors.EmptyDataError:
            return 0
        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return 0
        # No file or no rows yet: start from the beginning.
        return 0
|
def main(self): |
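        """Scrape the genre list, then resume book scraping just after the
        last record already present in the CSV."""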
|
        genres = self.scrape_genres()
        last_processed_id = self.get_last_processed_id()
        for genre in genres:
            self.scrape_book(genre, last_processed_id)
|
|
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scraper = GoodReadsScraper()
    scraper.main()
|
|