File size: 9,042 Bytes

4aa3246

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import os\n",
    "import logging\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GoodReadsScrapper:\n",
    "    def __init__(self) -> None:\n",
    "        self.BASE_URL = 'https://www.goodreads.com/shelf/show/{}'\n",
    "        self.GENRE_URL = 'https://www.goodreads.com/genres/list?page={}'\n",
    "        self.GOOD_READS_URL = 'https://www.goodreads.com{}'\n",
    "        self.CURRENT_INDEX = 0\n",
    "    \n",
    "    def scrape_genres(self,filename='genres.txt'):\n",
    "        if os.path.exists(filename):\n",
    "            with open(filename, 'r') as file:\n",
    "                genres = [line.strip() for line in file.readlines()]\n",
    "            print(\"Genres loaded from file.\")\n",
    "        else:\n",
    "            page = 1\n",
    "            genres = []\n",
    "\n",
    "            while True:\n",
    "                url = self.GENRE_URL.format(page)\n",
    "                response = requests.get(url)\n",
    "\n",
    "                if response.status_code == 200:\n",
    "                    soup = BeautifulSoup(response.text, 'html.parser')\n",
    "                    container_div = soup.find('div', {'class': 'leftContainer'})\n",
    "                    genre_divs = container_div.find_all('div', {'class': 'left'})\n",
    "                    for genre_div in genre_divs:\n",
    "                        genre = genre_div.find('a', {'class': 'mediumText actionLinkLite'})\n",
    "                        if genre:\n",
    "                            genre_text = genre.text.strip()\n",
    "                            genres.append(genre_text)\n",
    "                            print(\"Scrapped genre is:\", genre_text)\n",
    "\n",
    "                    next_page_link = soup.find('a', {'class': 'next_page'})\n",
    "                    if next_page_link is None:\n",
    "                        break\n",
    "\n",
    "                page += 1\n",
    "\n",
    "            with open(filename, 'w') as file:\n",
    "                for genre in genres:\n",
    "                    file.write(genre + '\\n')\n",
    "            print(\"Genres saved to file.\")\n",
    "\n",
    "        return genres\n",
    "    \n",
    "    def scrape_book(self,genre: str, csv_index: int):\n",
    "        response = requests.get(self.BASE_URL.format(genre))\n",
    "        \n",
    "        if response.status_code == 200:\n",
    "            soup = BeautifulSoup(response.text, 'html.parser')\n",
    "            container_div = soup.find('div', {'class': 'leftContainer'}) \n",
    "            book_divs = container_div.findAll('div', {'class': 'elementList'})\n",
    "            \n",
    "            for book_div in book_divs:\n",
    "                link = book_div.find('div', {'class': 'left'}).find('a', {'class': 'leftAlignedImage'})['href']\n",
    "                self.CURRENT_INDEX += 1\n",
    "                \n",
    "                if self.CURRENT_INDEX <= csv_index:\n",
    "                    continue\n",
    "                \n",
    "                self.scrape_x_book(link,self.CURRENT_INDEX)\n",
    "    \n",
    "    def scrape_x_book(self,book_url: str, i: int):\n",
    "        try:\n",
    "            response = requests.get(self.GOOD_READS_URL.format(book_url))\n",
    "\n",
    "            if response.status_code != 200:\n",
    "                logging.error(f\"Failed to fetch {book_url}. Status code: {response.status_code}\")\n",
    "                return\n",
    "\n",
    "            soup = BeautifulSoup(response.text, 'html.parser')\n",
    "\n",
    "            main_content = soup.find('div', {'class': 'BookPage__mainContent'})\n",
    "\n",
    "            # Get title\n",
    "            title = main_content.find('div', {'class': 'BookPageTitleSection'}).find('div', {'class': 'BookPageTitleSection__title'}).find('h1', {'class': 'Text Text__title1'}).text.strip()\n",
    "\n",
    "            # Metadata\n",
    "            metadata_section = main_content.find('div', {'class': 'BookPageMetadataSection'})\n",
    "\n",
    "            # Author\n",
    "            author = metadata_section.find('div', {'class': 'BookPageMetadataSection__contributor'}).find('span', {'class': 'ContributorLink__name'}).text.strip()\n",
    "\n",
    "            # Rating\n",
    "            rating = metadata_section.find('div', {'class': 'BookPageMetadataSection__ratingStats'}).find('div', {'class': 'RatingStatistics__column'}).find('div', {'class': 'RatingStatistics__rating'}).text.strip()\n",
    "\n",
    "            # Description\n",
    "            description = metadata_section.find('div', {'class': 'BookPageMetadataSection__description'}).find('div', {'class': 'TruncatedContent'}).find('span', {'class': 'Formatted'}).text.strip()\n",
    "\n",
    "            # Genres List\n",
    "            genres_list = []\n",
    "            genre_div = metadata_section.find('div', {'class': 'BookPageMetadataSection__genres'}).find('ul', {'class': 'CollapsableList'}).findAll('span', {'class': 'BookPageMetadataSection__genreButton'})\n",
    "\n",
    "            for genre in genre_div:\n",
    "                g = genre.find('span', {'class': 'Button__labelItem'}).text.strip()\n",
    "                genres_list.append(g)\n",
    "\n",
    "            # Get Reviews\n",
    "            reviews = []\n",
    "            reviews_section = soup.find('div', {'class': 'ReviewsSection'}).findAll('div', {'class': 'ReviewsList'})\n",
    "            articles = reviews_section[1].findAll('article', {'class': 'ReviewCard'})\n",
    "\n",
    "            for article in articles:\n",
    "                review_text = article.find('section', {'class': 'ReviewText'}).find('span', {'class': 'Formatted'}).text.strip()\n",
    "                reviews.append(review_text)\n",
    "\n",
    "            # Write to CSV\n",
    "            csv_filename = 'book_data.csv'\n",
    "            with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:\n",
    "                fieldnames = ['Id', 'Title', 'Author', 'Rating', 'Description', 'Genres', 'Reviews']\n",
    "                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
    "\n",
    "                # If the file is empty, write the header\n",
    "                if csvfile.tell() == 0:\n",
    "                    writer.writeheader()\n",
    "\n",
    "                writer.writerow({\n",
    "                    'Id': i,\n",
    "                    'Title': title,\n",
    "                    'Author': author,\n",
    "                    'Rating': rating,\n",
    "                    'Description': description,\n",
    "                    'Genres': ', '.join(genres_list),\n",
    "                    'Reviews': '\\n'.join(reviews)\n",
    "                })\n",
    "\n",
    "            # Log the processed book\n",
    "            print(f\"Processed book: {title} \\nRecord: {i}\")\n",
    "\n",
    "        except Exception as e:\n",
    "            logging.error(f\"Error processing book {i}: {e}\")\n",
    "            # Optionally, you can log the error and continue to the next book\n",
    "\n",
    "    def get_last_processed_id(self, csv_filename='book_data.csv'):\n",
    "        try:\n",
    "            if os.path.exists(csv_filename):\n",
    "                df = pd.read_csv(csv_filename)\n",
    "                if not df.empty and 'Id' in df.columns:\n",
    "                    return df['Id'].max()\n",
    "        except pd.errors.EmptyDataError:\n",
    "            # Handle the case where the file is empty\n",
    "            return 0\n",
    "        except Exception as e:\n",
    "            # Handle other exceptions\n",
    "            print(f\"Error reading CSV file: {e}\")\n",
    "        \n",
    "        return 0\n",
    "    \n",
    "    def main(self):\n",
    "        genres = self.scrape_genres()\n",
    "        last_processed_id = self.get_last_processed_id()\n",
    "        for genre in genres:\n",
    "            self.scrape_book(genre, last_processed_id)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the crawl: load or scrape the genre list, then scrape every genre's books into book_data.csv.\n",
    "scrapper = GoodReadsScrapper()\n",
    "scrapper.main()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}