In [38]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, SentencesDataset, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pickle
import nltk
import re



# Function for basic text preprocessing and tokenization
def preprocess_and_tokenize(text):
    # Remove non-English words
    english_words = set(nltk.corpus.words.words())
    text = " ".join(w if w.lower() in english_words or not w.isalpha() else '' for w in text.split())
    
    # Remove special characters
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    
    # Tokenization using nltk
    tokens = nltk.word_tokenize(text)
    
    return tokens

data = pd.read_csv('book_data.csv')

data = data.dropna(subset=['Description', 'Reviews'])

# Combine "Description" and "Reviews" columns for training
data['combined_text'] = data['Description'] + ' ' + data['Reviews']

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

model_name = 'sentence-transformers/paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(model_name, device='cpu')

train_examples = []
for index, row in train_data.iterrows():
    combined_text_tokens = preprocess_and_tokenize(row['combined_text'])
    input_example = InputExample(texts=[combined_text_tokens, combined_text_tokens], label=float(index))
    train_examples.append(input_example)

train_dataset = SentencesDataset(train_examples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
train_loss = losses.CosineSimilarityLoss(model=model)

model_save_path = 'fine_tuned_model_combined'
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    output_path=model_save_path
)

fine_tuned_model = SentenceTransformer(model_save_path)

print('MODEL TRAINED')

vector_database = {}
for index, row in data.iterrows():
    combined_text_tokens = preprocess_and_tokenize(row['combined_text'])
    embedding = fine_tuned_model.encode(combined_text_tokens)
    vector_database[index] = embedding

print('PKL IS SAVING')
with open('vector_database_combined.pkl', 'wb') as file:
    pickle.dump(vector_database, file)
print('PKL IS SAVED')
print('MODEL SAVING')
fine_tuned_model.save('fine_tuned_model_combined')
print('MODEL IS SAVED')


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

MODEL TRAINED


KeyboardInterrupt: 

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from sentence_transformers import SentencesDataset, losses
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers.util import pytorch_cos_sim
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import torch
def semantic_search(query, vector_database, model):
    query_embedding = model.encode(query)
    
    # Convert NumPy arrays to PyTorch tensors
    corpus_embeddings = torch.stack([torch.from_numpy(embedding) for embedding in vector_database.values()])
    
    # Use the PyTorch tensors in semantic_search
    hits = util.semantic_search(query_embedding, corpus_embeddings)
    
    results = [{'index': hit['corpus_id'], 'score': hit['score']} for hit in hits[0]]
    return results

def rerank_results(query, results, model):
    query_embedding = model.encode(query)
    reranked_results = sorted(results, key=lambda x: pytorch_cos_sim(query_embedding, vector_database[x['index']]), reverse=True)
    return reranked_results

query = "Romance and Adventure"
results = semantic_search(query, vector_database, fine_tuned_model)
reranked_results = rerank_results(query, results, fine_tuned_model)

for result in reranked_results:
    index = result['index']
    print(f"Title: {data.iloc[index]['Title']}, Score: {result['score']}")
    print(f"Description: {data.iloc[index]['Description']}")
    print("-----------")

Title: The Bible of Karate: Bubishi, Score: 0.9921729564666748
Description: Philosophy, strategy and medicine as related to the martial arts, translated with commentary.
-----------
Title: Antony and Cleopatra, Score: 0.9921565055847168
Description: A romantic tragedy about the relationship between Mark Antony and the Queen of Egypt, including notes and critical commentary.
-----------
Title: Last Chance to See, Score: 0.9920206069946289
Description: Join author Douglas Adams and zoologist Mark Carwardine as they take off around the world in search of exotic, endangered creatures.
-----------
Title: Out for Blood, Score: 0.9934567809104919
Description: Harvey, Alyxandra
-----------
Title: Man-eating Leopard of Rudraprayag, Score: 0.9949787855148315
Description: book, paperback, leopard
-----------
Title: A Visual Dictionary of Architecture, Score: 0.9938178658485413
Description: A Visual Dictionary of Architecture
-----------
Title: Rock the Beginning, Score: 0.994428813457489
Descript