import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import logging
import csv


class GoodReadsScraper:
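    """Scrape Goodreads genre listings, per-genre shelves, and individual
    book pages into a CSV file, resuming from the last record written.

    The CSS selectors below target Goodreads' current HTML and may need
    updating if the site's markup changes.
    """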
    def __init__(self) -> None:
        self.BASE_URL = "https://www.goodreads.com/shelf/show/{}"
        self.GENRE_URL = "https://www.goodreads.com/genres/list?page={}"
        self.GOOD_READS_URL = "https://www.goodreads.com{}"
        self.CURRENT_INDEX = 0  # running book counter across all genres

    def scrape_genres(self, filename="genres.txt"):
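        """Return all genre names, loading them from `filename` when it
        exists; otherwise paginate through the Goodreads genre index and
        cache the result to `filename`."""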
        if os.path.exists(filename):
            with open(filename, "r") as file:
                genres = [line.strip() for line in file]
            print("Genres loaded from file.")
        else:
            page = 1
            genres = []
            while True:
                url = self.GENRE_URL.format(page)
                response = requests.get(url)
                if response.status_code != 200:
                    # Bail out on a bad response rather than re-requesting
                    # the same page forever.
                    logging.error(
                        f"Failed to fetch {url}. Status code: {response.status_code}"
                    )
                    break
                soup = BeautifulSoup(response.text, "html.parser")
                container_div = soup.find("div", {"class": "leftContainer"})
                genre_divs = container_div.find_all("div", {"class": "left"})
                for genre_div in genre_divs:
                    genre = genre_div.find(
                        "a", {"class": "mediumText actionLinkLite"}
                    )
                    if genre:
                        genre_text = genre.text.strip()
                        genres.append(genre_text)
                        print("Scraped genre is:", genre_text)
                next_page_link = soup.find("a", {"class": "next_page"})
                if next_page_link is None:
                    break
                page += 1
            with open(filename, "w") as file:
                for genre in genres:
                    file.write(genre + "\n")
            print("Genres saved to file.")
        return genres

    def scrape_book(self, genre: str, csv_index: int):
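        """Scrape the shelf page for `genre` and process each listed book,
        skipping entries whose running index is at or below `csv_index`
        (the last record already written to the CSV)."""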
        response = requests.get(self.BASE_URL.format(genre))
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            container_div = soup.find("div", {"class": "leftContainer"})
            book_divs = container_div.find_all("div", {"class": "elementList"})
            for book_div in book_divs:
                link = book_div.find("div", {"class": "left"}).find(
                    "a", {"class": "leftAlignedImage"}
                )["href"]
                self.CURRENT_INDEX += 1
                if self.CURRENT_INDEX <= csv_index:
                    # Already written on a previous run; skip to resume.
                    continue
                self.scrape_x_book(link, self.CURRENT_INDEX)

    def scrape_x_book(self, book_url: str, i: int):
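        """Scrape a single book page and append one record to book_data.csv."""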
        try:
            response = requests.get(self.GOOD_READS_URL.format(book_url))
            if response.status_code != 200:
                logging.error(
                    f"Failed to fetch {book_url}. Status code: {response.status_code}"
                )
                return
            soup = BeautifulSoup(response.text, "html.parser")
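            # The "BookPage__..." class names below come from Goodreads'
            # current React markup; if the site is redesigned these lookups
            # will fail, and the except block at the bottom logs and moves on.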
            main_content = soup.find("div", {"class": "BookPage__mainContent"})
            # Get title
            title = (
                main_content.find("div", {"class": "BookPageTitleSection"})
                .find("div", {"class": "BookPageTitleSection__title"})
                .find("h1", {"class": "Text Text__title1"})
                .text.strip()
            )
            # Metadata
            metadata_section = main_content.find(
                "div", {"class": "BookPageMetadataSection"}
            )
            # Author
            author = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__contributor"}
                )
                .find("span", {"class": "ContributorLink__name"})
                .text.strip()
            )
            # Rating
            rating = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__ratingStats"}
                )
                .find("div", {"class": "RatingStatistics__column"})
                .find("div", {"class": "RatingStatistics__rating"})
                .text.strip()
            )
            # Description
            description = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__description"}
                )
                .find("div", {"class": "TruncatedContent"})
                .find("span", {"class": "Formatted"})
                .text.strip()
            )
            # Genres list
            genres_list = []
            genre_div = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__genres"}
                )
                .find("ul", {"class": "CollapsableList"})
                .find_all("span", {"class": "BookPageMetadataSection__genreButton"})
            )
            for genre in genre_div:
                g = genre.find("span", {"class": "Button__labelItem"}).text.strip()
                genres_list.append(g)
            # Get reviews
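            # Note: the page renders multiple "ReviewsList" containers;
            # index 1 is assumed below to be the one holding the visible
            # review cards. An IndexError here lands in the except block.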
            reviews = []
            reviews_section = soup.find("div", {"class": "ReviewsSection"}).find_all(
                "div", {"class": "ReviewsList"}
            )
            articles = reviews_section[1].find_all("article", {"class": "ReviewCard"})
            for article in articles:
                review_text = (
                    article.find("section", {"class": "ReviewText"})
                    .find("span", {"class": "Formatted"})
                    .text.strip()
                )
                reviews.append(review_text)
            # Write to CSV
            csv_filename = "book_data.csv"
            with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
                fieldnames = [
                    "Id",
                    "Title",
                    "Author",
                    "Rating",
                    "Description",
                    "Genres",
                    "Reviews",
                ]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                # If the file is empty, write the header first.
                if csvfile.tell() == 0:
                    writer.writeheader()
                writer.writerow(
                    {
                        "Id": i,
                        "Title": title,
                        "Author": author,
                        "Rating": rating,
                        "Description": description,
                        "Genres": ", ".join(genres_list),
                        "Reviews": "\n".join(reviews),
                    }
                )
            # Log the processed book
            print(f"Processed book: {title}\nRecord: {i}")
        except Exception as e:
            # Log the error and continue with the next book.
            logging.error(f"Error processing book {i}: {e}")

    def get_last_processed_id(self, csv_filename="book_data.csv"):
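        """Return the highest "Id" already written to `csv_filename`, or 0
        if the file is missing, empty, or unreadable."""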
        try:
            if os.path.exists(csv_filename):
                df = pd.read_csv(csv_filename)
                if not df.empty and "Id" in df.columns:
                    return df["Id"].max()
        except pd.errors.EmptyDataError:
            # The file exists but holds no rows yet.
            return 0
        except Exception as e:
            print(f"Error reading CSV file: {e}")
        # Fall through: no file, no rows, or no "Id" column.
        return 0

    def main(self):
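        """Load the genre list, then resume scraping from the last record
        already present in the CSV."""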
        genres = self.scrape_genres()
        last_processed_id = self.get_last_processed_id()
        for genre in genres:
            self.scrape_book(genre, last_processed_id)


if __name__ == "__main__":
    scraper = GoodReadsScraper()
    scraper.main()
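
# A possible hardening step (not in the original script): route the
# requests.get calls above through a small helper that adds a timeout and
# a polite delay, so a stalled response cannot hang the scraper and the
# site is not hammered. A minimal sketch, where `polite_get` and its
# parameters are hypothetical names:
#
#     import time
#
#     def polite_get(url: str, delay: float = 1.0, timeout: float = 10.0):
#         time.sleep(delay)  # pause between requests
#         return requests.get(url, timeout=timeout)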