# Import necessary libraries. | |
import pandas as pd | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Read data source Excel files. | |
df1 = pd.read_excel('Online_Retail.xlsx') | |
# Check dataframe information. | |
#df1.info() | |
# Read header of dataframe. | |
#df1.head() | |
# Check any column containing the null value. | |
#df1.isnull().any() | |
# Count the number of null value records in the CustomerID column. | |
#df1['CustomerID'].isna().sum() | |
df1a = df1.dropna(subset=['CustomerID']) | |
# Check dataframe information. | |
#df1a.info() | |
# Read header of dataframe. | |
#df1a.head() | |
# Create CustomerID vs Item (Purchased Items, by StockCode) matrix by pivot table function. | |
CustomerID_Item_matrix = df1a.pivot_table( | |
index='CustomerID', | |
columns='StockCode', | |
values='Quantity', | |
aggfunc='sum' | |
) | |
# Display the shape of matrix, 4372 rows of CustomerID, 3684 columns of Item. | |
#CustomerID_Item_matrix.shape | |
# Update illustration of the matrix, 1 to represent customer have purchased item, 0 to represent customer haven't purchased. | |
CustomerID_Item_matrix = CustomerID_Item_matrix.applymap(lambda x: 1 if x > 0 else 0) | |
# Read header of CustomerID vs Item matrix. | |
#CustomerID_Item_matrix.loc[12680:].head() | |
# Create User to User similarity matrix. | |
user_to_user_similarity_matrix = pd.DataFrame( | |
cosine_similarity(CustomerID_Item_matrix) | |
) | |
# Display header of User to User similarity matrix. | |
#user_to_user_similarity_matrix.head() | |
# Update index to corresponding CustomerID. | |
user_to_user_similarity_matrix.columns = CustomerID_Item_matrix.index | |
user_to_user_similarity_matrix['CustomerID'] = CustomerID_Item_matrix.index | |
user_to_user_similarity_matrix = user_to_user_similarity_matrix.set_index('CustomerID') | |
# Display header of User to User similarity matrix. | |
#user_to_user_similarity_matrix.head() | |
# Randomly pick CustomerID (12702) to display the most similar CustomerID. | |
# The most similar CustomerID is 14608, which has 51% similarity. | |
#user_to_user_similarity_matrix.loc[12702.0].sort_values(ascending=False) | |
# Display CustomerID (12702) purchased items. | |
items_purchased_by_X = set(CustomerID_Item_matrix.loc[12702.0].iloc[ | |
CustomerID_Item_matrix.loc[12702.0].to_numpy().nonzero()].index) | |
#items_purchased_by_X | |
# Display CustomerID (14608) purchased items. | |
items_purchased_by_Y = set(CustomerID_Item_matrix.loc[14608.0].iloc[ | |
CustomerID_Item_matrix.loc[14608.0].to_numpy().nonzero()].index) | |
#items_purchased_by_Y | |
# Find out items which purchased by X (12702) but not yet purchased by Y (14608). | |
items_to_recommend_to_Y = items_purchased_by_X - items_purchased_by_Y | |
# Display the list of items recommended for Y (14608). | |
#items_to_recommend_to_Y | |
# Display the list of items recommended for Y (14608) with item Description. | |
print(df1a.loc[ | |
df1a['StockCode'].isin(items_to_recommend_to_Y), | |
['StockCode', 'Description'] | |
].drop_duplicates().set_index('StockCode')) | |