# %% Imports (original order preserved on purpose: reordering native-backed
# libraries such as torch / faiss can change load-time behaviour).
import re
import nltk
import numpy as np
import pandas as pd
from langdetect import detect
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.util import semantic_search
from torch.utils.data import DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.preprocessing import MultiLabelBinarizer
import faiss
from FlagEmbedding import FlagReranker
from sklearn.model_selection import train_test_split

# %% Load data
FILE_PATH = '../data/book_data.csv'

# Rows missing any of these columns are dropped before further processing.
REQUIRED_COLUMNS = ['Title', 'Description', 'Genres']

# The index is intentionally NOT reset, exactly as in the original cell, so
# row labels keep pointing at the positions they had in the CSV.
df = pd.read_csv(FILE_PATH).dropna(subset=REQUIRED_COLUMNS)

# %% Text preprocessing
# Lazily-filled cache for the English stop-word set; see _english_stop_words().
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built on first use only."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise free text into a space-separated string of lower-case tokens.

    Steps: lower-case the text, replace every character that is not an ASCII
    letter or whitespace with a space, tokenise with NLTK's ``word_tokenize``,
    drop tokens found in NLTK's English stop-word list, and re-join the
    remaining tokens with single spaces.

    Args:
        text: The text to clean. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The cleaned text; ``''`` when the input is missing or nothing
        survives filtering.
    """
    # `text != text` is only true for NaN; guard so a missing cell does not
    # crash on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # Substitute a space (not the empty string) so that words separated only by
    # punctuation are not fused together, e.g. "man-eating" -> "man eating"
    # rather than "maneating", and "don't" -> "don t" rather than "dont".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Stop words are loaded once and reused instead of being rebuilt per call.
    stop_words = _english_stop_words()
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(tokens)


# %% Disabled variant (kept commented out, as in the original notebook)
# def title_description_genres_to_string(row):
#     genres = ' '.join([genre.strip() for genre in row['Genres'].split(',')])

#     title = row['Title']

#     descriptions = str(row['Description'])

#     tokens = word_tokenize(descriptions)

#     tokens = [word.lower() for word in tokens if word.isalpha()]

#     stop_words = set(stopwords.words('english'))
#     tokens = [word for word in tokens if word not in stop_words]

#     porter = PorterStemmer()
#     tokens = [porter.stem(word) for word in tokens]

#     preprocessed_text = ' '.join(tokens)

#     return "%s %s %s" %(title, genres, preprocessed_text)
\n", " | Id | \n", "Title | \n", "Author | \n", "Rating | \n", "Description | \n", "Genres | \n", "Reviews | \n", "Combined | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "Beowulf | \n", "Seamus Heaney | \n", "3.48 | \n", "Composed toward the end of the first millenniu... | \n", "Classics, Poetry, Fiction, Fantasy, Mythology,... | \n", "*bum bum* IN A WORLD . . . *bum bum* . . . FUL... | \n", "Beowulf Classics Poetry Fiction Fantasy Mythol... | \n", "
1 | \n", "2 | \n", "The Evening and the Morning | \n", "Ken Follett | \n", "4.38 | \n", "The thrilling and addictive prequel to The Pil... | \n", "Historical Fiction, Fiction, Historical, Audio... | \n", "It's 997 CE, the end of the Dark Ages in Engla... | \n", "The Evening and the Morning Historical Fiction... | \n", "
2 | \n", "3 | \n", "The Abbot's Tale | \n", "Conn Iggulden | \n", "4.05 | \n", "In the year 937, the new king of England, a gr... | \n", "Historical Fiction, Fiction, Historical, Medie... | \n", "There is never one truth, one love, or one ene... | \n", "The Abbot's Tale Historical Fiction Fiction Hi... | \n", "
3 | \n", "4 | \n", "Ibn Fadlān and the Land of Darkness: Arab Trav... | \n", "Ahmad ibn Fadlān | \n", "3.87 | \n", "In 922 AD, an Arab envoy from Baghdad named Ib... | \n", "History, Travel, Nonfiction, Classics, Islam, ... | \n", "رسالة ابن فضلان .. أو ما يسمى برحلة ابن فضلان ... | \n", "Ibn Fadlān and the Land of Darkness: Arab Trav... | \n", "
4 | \n", "5 | \n", "The Empty Throne | \n", "Bernard Cornwell | \n", "4.38 | \n", "This eighth entry in New York Times bestsellin... | \n", "Historical Fiction, Fiction, Historical, Medie... | \n", "The Empty Throne was an improvement over The P... | \n", "The Empty Throne Historical Fiction Fiction Hi... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1780 | \n", "1843 | \n", "The Soul of an Octopus | \n", "Sy Montgomery | \n", "3.93 | \n", "In pursuit of the wild, solitary, predatory oc... | \n", "Nonfiction, Science, Animals, Nature, Memoir, ... | \n", "I'm kind of \"eh\" on this book. It bills itself... | \n", "The Soul of an Octopus Nonfiction Science Anim... | \n", "
1781 | \n", "1844 | \n", "The Jim Corbett Omnibus: \"Man-eaters of Kumaon... | \n", "Jim Corbett | \n", "4.54 | \n", "Jim Corbett's riveting accounts of shikar in t... | \n", "Nonfiction, Wildlife, Nature, Travel, Biograph... | \n", "One of the best books ever written, this book ... | \n", "The Jim Corbett Omnibus: \"Man-eaters of Kumaon... | \n", "
1782 | \n", "1845 | \n", "The Tiger: A True Story of Vengeance and Survival | \n", "John Vaillant | \n", "4.07 | \n", "It’s December 1997, and a man-eating tiger is ... | \n", "Nonfiction, History, Nature, Animals, Russia, ... | \n", "Fearful symmetry indeed. In 1997, during time ... | \n", "The Tiger: A True Story of Vengeance and Survi... | \n", "
1783 | \n", "1846 | \n", "100 Heartbeats: The Race to Save Earth's Most ... | \n", "Jeff Corwin | \n", "4.17 | \n", "It's no secret that our planet―and the delicat... | \n", "Nonfiction, Animals, Science, Nature, Conserva... | \n", "I learned a lot reading this book. Frightening... | \n", "100 Heartbeats: The Race to Save Earth's Most ... | \n", "
1784 | \n", "1847 | \n", "An Elephant in My Kitchen | \n", "Françoise Malby-Anthony | \n", "4.39 | \n", "A blonde, chic Parisienne, Francoise never exp... | \n", "Nonfiction, Animals, Memoir, Africa, Nature, B... | \n", "Thula Thula, South Africa, the sanctuary for e... | \n", "An Elephant in My Kitchen Nonfiction Animals M... | \n", "
1779 rows × 8 columns
\n", "