|
import csv
import logging
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
|
class GoodReadsScraper:
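    """Scrape Goodreads genre names, then book metadata and reviews for each
    genre, appending one row per book to a CSV file."""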
|
    def __init__(self) -> None:
        self.BASE_URL = "https://www.goodreads.com/shelf/show/{}"
        self.GENRE_URL = "https://www.goodreads.com/genres/list?page={}"
        self.GOOD_READS_URL = "https://www.goodreads.com{}"
        # Running row counter; used to skip records already written to the CSV.
        self.CURRENT_INDEX = 0
|
def scrape_genres(self, filename="genres.txt"): |
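        """Return all genre names, reading from `filename` when it exists and
        otherwise scraping the paginated Goodreads genre index and caching it."""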
|
        if os.path.exists(filename):
            with open(filename, "r") as file:
                genres = [line.strip() for line in file]
            print("Genres loaded from file.")
        else:
            page = 1
            genres = []

            while True:
                url = self.GENRE_URL.format(page)
                response = requests.get(url, timeout=10)

                if response.status_code != 200:
                    logging.error(
                        f"Failed to fetch {url}. Status code: {response.status_code}"
                    )
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                container_div = soup.find("div", {"class": "leftContainer"})
                genre_divs = container_div.find_all("div", {"class": "left"})
                for genre_div in genre_divs:
                    genre = genre_div.find(
                        "a", {"class": "mediumText actionLinkLite"}
                    )
                    if genre:
                        genre_text = genre.text.strip()
                        genres.append(genre_text)
                        print("Scraped genre:", genre_text)

                # Stop when the pagination widget has no "next page" link.
                next_page_link = soup.find("a", {"class": "next_page"})
                if next_page_link is None:
                    break

                page += 1

            with open(filename, "w") as file:
                for genre in genres:
                    file.write(genre + "\n")
            print("Genres saved to file.")

        return genres
|
def scrape_book(self, genre: str, csv_index: int): |
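        """Scrape the shelf page for `genre`, visiting each listed book while
        skipping records at or below `csv_index` (already written to the CSV)."""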
|
        response = requests.get(self.BASE_URL.format(genre), timeout=10)

        if response.status_code != 200:
            logging.error(
                f"Failed to fetch shelf for {genre}. "
                f"Status code: {response.status_code}"
            )
            return

        soup = BeautifulSoup(response.text, "html.parser")
        container_div = soup.find("div", {"class": "leftContainer"})
        book_divs = container_div.find_all("div", {"class": "elementList"})

        for book_div in book_divs:
            link = book_div.find("div", {"class": "left"}).find(
                "a", {"class": "leftAlignedImage"}
            )["href"]
            self.CURRENT_INDEX += 1

            # Resume support: skip books written to the CSV on a previous run.
            if self.CURRENT_INDEX <= csv_index:
                continue

            self.scrape_x_book(link, self.CURRENT_INDEX)
|
def scrape_x_book(self, book_url: str, i: int): |
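        """Fetch a single book page and append its title, author, rating,
        description, genres, and visible reviews as row `i` of the CSV."""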
|
        try:
            response = requests.get(
                self.GOOD_READS_URL.format(book_url), timeout=10
            )

            if response.status_code != 200:
                logging.error(
                    f"Failed to fetch {book_url}. Status code: {response.status_code}"
                )
                return

            soup = BeautifulSoup(response.text, "html.parser")

            # The chained find() calls below raise AttributeError if Goodreads
            # changes its markup; the except clause at the end logs and skips
            # the book.
            main_content = soup.find("div", {"class": "BookPage__mainContent"})

            title = (
                main_content.find("div", {"class": "BookPageTitleSection"})
                .find("div", {"class": "BookPageTitleSection__title"})
                .find("h1", {"class": "Text Text__title1"})
                .text.strip()
            )

            metadata_section = main_content.find(
                "div", {"class": "BookPageMetadataSection"}
            )

            author = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__contributor"}
                )
                .find("span", {"class": "ContributorLink__name"})
                .text.strip()
            )

            rating = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__ratingStats"}
                )
                .find("div", {"class": "RatingStatistics__column"})
                .find("div", {"class": "RatingStatistics__rating"})
                .text.strip()
            )

            description = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__description"}
                )
                .find("div", {"class": "TruncatedContent"})
                .find("span", {"class": "Formatted"})
                .text.strip()
            )

            genres_list = []
            genre_spans = (
                metadata_section.find(
                    "div", {"class": "BookPageMetadataSection__genres"}
                )
                .find("ul", {"class": "CollapsableList"})
                .find_all("span", {"class": "BookPageMetadataSection__genreButton"})
            )

            for genre in genre_spans:
                g = genre.find("span", {"class": "Button__labelItem"}).text.strip()
                genres_list.append(g)
|
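            # Only reviews present in the initial, server-rendered HTML are
            # captured; requests does not execute the page's JavaScript.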
            reviews = []
            reviews_section = soup.find("div", {"class": "ReviewsSection"}).find_all(
                "div", {"class": "ReviewsList"}
            )
            # The page contains several ReviewsList containers; the second one
            # holds the visible ReviewCard articles.
            articles = reviews_section[1].find_all("article", {"class": "ReviewCard"})

            for article in articles:
                review_text = (
                    article.find("section", {"class": "ReviewText"})
                    .find("span", {"class": "Formatted"})
                    .text.strip()
                )
                reviews.append(review_text)
|
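            # Append one row per book; the header is written only when the
            # file is empty so that interrupted runs can resume cleanly.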
            csv_filename = "book_data.csv"
            with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
                fieldnames = [
                    "Id",
                    "Title",
                    "Author",
                    "Rating",
                    "Description",
                    "Genres",
                    "Reviews",
                ]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                if csvfile.tell() == 0:
                    writer.writeheader()

                writer.writerow(
                    {
                        "Id": i,
                        "Title": title,
                        "Author": author,
                        "Rating": rating,
                        "Description": description,
                        "Genres": ", ".join(genres_list),
                        "Reviews": "\n".join(reviews),
                    }
                )

            print(f"Processed book: {title}\nRecord: {i}")

        except Exception as e:
            logging.error(f"Error processing book {i}: {e}")
|
def get_last_processed_id(self, csv_filename="book_data.csv"): |
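        """Return the highest Id already written to the CSV, or 0 when the
        file is missing, empty, or unreadable."""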
|
        try:
            if os.path.exists(csv_filename):
                df = pd.read_csv(csv_filename)
                if not df.empty and "Id" in df.columns:
                    return df["Id"].max()
        except pd.errors.EmptyDataError:
            return 0
        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return 0
        # No file or no rows yet: start from the beginning.
        return 0
|
def main(self): |
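        """Scrape the genre list, then resume book scraping just after the
        last record already present in the CSV."""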
|
        genres = self.scrape_genres()
        last_processed_id = self.get_last_processed_id()
        for genre in genres:
            self.scrape_book(genre, last_processed_id)
|
|
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scraper = GoodReadsScraper()
    scraper.main()
|
|