File size: 2,896 Bytes
8c3633d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

# Import necessary libraries.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw transaction records from the source Excel workbook.
SOURCE_WORKBOOK = 'Online_Retail.xlsx'
df1 = pd.read_excel(SOURCE_WORKBOOK)

# Optional exploration of the raw frame (uncomment as needed):
#df1.info()                        # column dtypes and non-null counts
#df1.head()                        # first few records
#df1.isnull().any()                # which columns contain null values
#df1['CustomerID'].isna().sum()    # number of records without a CustomerID

# Discard every record whose CustomerID is null; the remaining rows are
# the ones used to build the customer/item matrix further down.
REQUIRED_COLUMNS = ['CustomerID']
df1a = df1.dropna(subset=REQUIRED_COLUMNS)

# Check dataframe information.
#df1a.info()

# Read header of dataframe.
#df1a.head()

# Build the CustomerID x StockCode matrix with a pivot table: one row per
# customer, one column per item, each cell holding the summed Quantity for
# that (customer, item) pair. Pairs with no records are left as NaN.
CustomerID_Item_matrix = df1a.pivot_table(
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
    aggfunc='sum',
)

# Inspect the matrix dimensions (rows = customers, columns = items).
#CustomerID_Item_matrix.shape

# Turn the quantities into purchase flags: 1 when the customer's summed
# quantity for the item is positive, 0 otherwise. NaN and non-positive
# totals both compare False against 0, so they become 0.
# A vectorized comparison is used instead of DataFrame.applymap, which is
# deprecated in pandas >= 2.1 and calls a Python function once per cell.
CustomerID_Item_matrix = (CustomerID_Item_matrix > 0).astype('int64')

# Read header of CustomerID vs Item matrix.
#CustomerID_Item_matrix.loc[12680:].head()

# Create the user-to-user similarity matrix: the pairwise cosine similarity
# between the rows (customers) of the 0/1 purchase matrix.
# Both axes are labelled with the CustomerIDs taken from the purchase
# matrix index, so a customer's similarities can be looked up with
# user_to_user_similarity_matrix.loc[<CustomerID>].
user_to_user_similarity_matrix = pd.DataFrame(
    cosine_similarity(CustomerID_Item_matrix),
    index=CustomerID_Item_matrix.index,
    columns=CustomerID_Item_matrix.index,
)

# Display header of the user-to-user similarity matrix.
#user_to_user_similarity_matrix.head()

# Display header of User to User similarity matrix. 
#user_to_user_similarity_matrix.head()

# Randomly pick CustomerID (12702) to display the most similar CustomerID.
# The most similar CustomerID is 14608, which has 51% similarity.
#user_to_user_similarity_matrix.loc[12702.0].sort_values(ascending=False)

def _purchased_stock_codes(customer_id):
    """Return the set of StockCodes with a non-zero flag for *customer_id*.

    Looks up the customer's row in ``CustomerID_Item_matrix`` and collects
    the column labels at the positions where the row value is non-zero.
    """
    purchases = CustomerID_Item_matrix.loc[customer_id]
    nonzero_positions = purchases.to_numpy().nonzero()[0]
    return set(purchases.index[nonzero_positions])


# Items purchased by customer X (12702).
items_purchased_by_X = _purchased_stock_codes(12702.0)
#items_purchased_by_X

# Items purchased by customer Y (14608).
items_purchased_by_Y = _purchased_stock_codes(14608.0)
#items_purchased_by_Y

# Items that X (12702) has purchased but Y (14608) has not yet purchased.
items_to_recommend_to_Y = items_purchased_by_X.difference(items_purchased_by_Y)

# Display the list of items recommended for Y (14608).
#items_to_recommend_to_Y

# Print the items recommended for Y (14608) together with their Description.
# Select the transaction rows whose StockCode is in the recommendation set,
# keep only the StockCode/Description pair, collapse repeated pairs, and
# index the result by StockCode.
is_recommended = df1a['StockCode'].isin(items_to_recommend_to_Y)
recommended_items = df1a.loc[is_recommended, ['StockCode', 'Description']]
recommended_items = recommended_items.drop_duplicates()
recommended_items = recommended_items.set_index('StockCode')
print(recommended_items)