analytics-jiten committed on
Commit
01ad901
·
1 Parent(s): d3449c0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -0
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import pandas as pd
3
+ import streamlit as st
4
+ from preprocessing import preprocess_reviews
5
+ from aspects_extraction import extract_aspects
6
+ from clustering import cluster_aspect_terms
7
+ import plotly.express as px
8
+ import matplotlib.pyplot as plt
9
+
10
+
11
# Bundled sample datasets offered in the sidebar: display name -> CSV path.
defaultCsv = {
    'Serco USB Hub + Sound Card': 'reviews.csv',
    'Honey': 'reviews_honey.csv',
}

# Page-level Streamlit settings; the wide layout gives the charts more room.
st.set_page_config(
    page_title="Actionable Insights From Reviews",
    layout="wide",
)
21
+
22
@st.cache
def load_reviews(uploaded_file=None, default_file=None):
    """Load, validate and preprocess a reviews CSV.

    An uploaded file takes precedence over the bundled sample file.

    Args:
        uploaded_file: CSV source supplied by the user (anything
            ``pd.read_csv`` accepts), or None.
        default_file: Path of a sample CSV used when nothing is uploaded,
            or None.

    Returns:
        The result of ``preprocess_reviews`` applied to the validated frame.

    Raises:
        ValueError: If neither source is given, or if validation fails.
    """
    # Only read the source that will actually be used.
    source = uploaded_file if uploaded_file is not None else default_file
    if source is None:
        # Raise the error type the page-level handler already reports,
        # instead of failing later on an unbound local.
        raise ValueError("a reviews file is required")

    reviews = validate_reviews_dataframe(pd.read_csv(source))
    return preprocess_reviews(reviews)
34
+
35
def validate_reviews_dataframe(r):
    """Check the schema of a reviews frame and drop incomplete rows.

    Args:
        r: DataFrame expected to hold ``title`` and ``review`` columns of
            object dtype and a ``rating`` column of dtype float64.

    Returns:
        A copy of ``r`` without rows that contain any missing value.

    Raises:
        ValueError: If a required column is missing, has the wrong dtype,
            or if any remaining rating lies outside the range 0 to 5.
    """
    # Presence is checked for every column before any dtype is inspected.
    for column in ('title', 'review', 'rating'):
        if column not in r.columns:
            raise ValueError(f"column {column} is required")

    for column in ('title', 'review'):
        if r[column].dtype != 'O':
            raise ValueError(f"column {column} must be string")
    if r['rating'].dtype != 'float64':
        raise ValueError("column rating must be float")

    r = r.dropna()

    # A rating is invalid when it is below 0 OR above 5; combining the two
    # comparisons with "&" could never match anything.
    rating = r['rating']
    if ((rating < 0) | (rating > 5)).any():
        raise ValueError("values in column rating must be between 0 and 5")
    return r
52
+
53
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the ``en_core_web_lg`` spaCy pipeline (memoised via ``st.cache``)."""
    model_name = "en_core_web_lg"
    return spacy.load(model_name)
56
+
57
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def get_aspects(reviews):
    """Run aspect extraction over ``reviews`` using the shared spaCy model.

    Returns whatever ``extract_aspects`` produces for the given reviews.
    """
    return extract_aspects(load_model(), reviews)
61
+
62
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def cluster_aspects(aspects):
    """Replace each aspect term with its cluster representative.

    Args:
        aspects: Frame with an ``aspect`` column (assumed to be a pandas
            DataFrame, as produced by ``get_aspects``).

    Returns:
        A new frame whose ``aspect`` column has been passed through the
        mapping returned by ``cluster_aspect_terms``; the input is left
        untouched.
    """
    nlp = load_model()
    replacements = cluster_aspect_terms(nlp, aspects)

    # Work on a copy: the argument is the cached result of get_aspects, and
    # rewriting it in place would alter that cache entry for later reruns.
    clustered = aspects.copy()
    # NOTE(review): terms absent from `replacements` become NaN after map();
    # confirm that cluster_aspect_terms covers every term.
    clustered['aspect'] = clustered['aspect'].map(replacements)
    return clustered
68
+
69
def get_aspects_with_ratings(aspects, reviews):
    """Attach each review's rating and a sentiment label to its aspects.

    Args:
        aspects: Frame with a ``review_id`` column that refers to the index
            of ``reviews``.
        reviews: Frame with a numeric ``rating`` column.

    Returns:
        ``aspects`` joined with ``rating`` plus a categorical
        ``review_sentiment`` column: ratings in [0, 3] are 'Negative',
        (3, 4] 'Neutral' and (4, 5] 'Positive'.
    """
    aspect_with_ratings = pd.merge(
        aspects,
        reviews[['rating']],
        left_on='review_id',
        right_index=True,
    )
    aspect_with_ratings['review_sentiment'] = pd.cut(
        aspect_with_ratings['rating'],
        bins=[0, 3, 4, 5],
        right=True,
        # Without this the first bin is (0, 3], so a rating of exactly 0
        # would fall outside every bin and end up with a NaN sentiment.
        include_lowest=True,
        labels=['Negative', 'Neutral', 'Positive'],
    )
    return aspect_with_ratings
80
+
81
def get_aspect_treemap(aspects):
    """Build a treemap of aspect/opinion pairs sized by how often each occurs."""
    # size() yields an unnamed Series, so reset_index() stores the counts in
    # a column labelled 0, which is what `values=0` refers to below.
    pair_counts = aspects.groupby(['aspect', 'opinion']).size().reset_index()
    hierarchy = [px.Constant('Aspects'), 'aspect', 'opinion']

    figure = px.treemap(pair_counts, path=hierarchy, values=0)
    figure.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})
    return figure
88
+
89
def plot_pain_points(aspect_with_ratings):
    """Bar chart of the ten aspects that appear most often in negative reviews."""
    negative_rows = aspect_with_ratings.query('review_sentiment == "Negative"')
    counts_by_aspect = negative_rows.groupby('aspect').size()
    top_ten = counts_by_aspect.sort_values(ascending=False).iloc[:10]

    chart = px.bar(top_ten)
    chart.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})
    chart.update_traces(marker_color='red', showlegend=False)
    return chart
100
+
101
def plot_gain_points(aspect_with_ratings):
    """Bar chart of the ten aspects that appear most often in positive reviews."""
    positive_rows = aspect_with_ratings.query('review_sentiment == "Positive"')
    counts_by_aspect = positive_rows.groupby('aspect').size()
    top_ten = counts_by_aspect.sort_values(ascending=False).iloc[:10]

    chart = px.bar(top_ten)
    chart.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})
    chart.update_traces(marker_color='green', showlegend=False)
    return chart
112
+
113
def plot_sentiment_by_aspect(aspect_with_ratings, top=15):
    """Grouped bar chart of sentiment counts for the most mentioned aspects.

    Args:
        aspect_with_ratings: Frame with ``aspect`` and ``review_sentiment``
            columns.
        top: Number of aspects to show, ranked by total mentions.

    Returns:
        A Plotly figure with one bar per sentiment for each aspect.
    """
    counts = pd.crosstab(
        index=aspect_with_ratings['aspect'],
        columns=aspect_with_ratings['review_sentiment'],
        margins=True,
    )
    # Rank by the 'All' margin, then remove the margin row and column by
    # label. Slicing them off by position assumes the 'All' row sorts first,
    # which does not hold on ties (e.g. a single aspect whose total equals
    # the grand total) and would discard a real aspect instead.
    pivot = (
        counts
        .sort_values(by='All', ascending=False)
        .drop(index='All', columns='All')
    )

    fig = px.bar(
        pivot.iloc[:top],
        barmode='group',
        color_discrete_map={
            'Positive': 'green',
            'Negative': 'red',
            'Neutral': 'blue',
        },
    )
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    return fig
127
+
128
+
129
# --- Page introduction -----------------------------------------------------
st.write("## Actionable Insights From Reviews")

st.write("""
Key to building a successful product is understanding what users want and what users don't want.

This insight can be useful in several ways.

1. Designing product that users actually want.
2. Fixing defects in product or addressing users pain points.
3. Staying ahead of the competition.

There are millions of reviews that people leave on sites like amazon, tripadvisor etc.
To gain insights from this data, you could either read all the reviews one by one or
let machine analyze these reviews and find main topics that user care about.
""")

st.write("## Extracting Aspect Opinion Pairs")
st.write("""
Let's say the customer wrote, `The material of the shirt is not soft`.
Here `material` is the `aspect` of shirt and `not soft` is the users `opinion`
about this aspect. The analyzer finds aspect opinion pairs from the reviews.
""")

st.write("### Customer Reviews")
st.write("""
Dataframe containing reviews of the customer. Title, review, and rating columns are required
""")

# --- Sidebar: choose a bundled sample or upload a CSV ------------------------
st.sidebar.title("Select Reviews File")

default_file = st.sidebar.selectbox(
    "Choose Sample File",
    defaultCsv.keys(),
)
if default_file is not None:
    # Translate the display name picked in the select box into its CSV path.
    default_file = defaultCsv[default_file]

st.sidebar.write("<div style='text-align:center'>or</div>", unsafe_allow_html=True)


uploaded_file = st.sidebar.file_uploader(
    'Choose a CSV File',
    type='csv',
)
st.sidebar.write("CSV with title(string), review(string) and ratings(float 0-5) columns")
174
+
175
# Run the whole analysis; any ValueError (e.g. from CSV validation) is shown
# to the user as an error box instead of a traceback.
try:
    reviews = load_reviews(uploaded_file, default_file)
    st.write(reviews)

    # Extract aspect/opinion pairs, merge similar aspect terms, then attach
    # the rating-based sentiment of the originating review.
    aspects = cluster_aspects(get_aspects(reviews))
    aspects_with_ratings = get_aspects_with_ratings(aspects, reviews)

    st.write("### Extracted Aspect Opinion Pairs")
    st.write("""
Treemap of aspect opinion pairs extracted from reviews, treemap
is sized according to number of reviews.
""")
    treemap_figure = get_aspect_treemap(aspects)
    st.plotly_chart(treemap_figure, use_container_width=True)

    st.write("### Pain Points And Gain Points")
    pain_column, gain_column = st.columns(2)

    with pain_column:
        st.write('Top Pain Points (by number of -ve reviews)')
        pain_figure = plot_pain_points(aspects_with_ratings)
        st.plotly_chart(pain_figure, use_container_width=True)

    with gain_column:
        st.write('Top Gain Points (by number of +ve reviews)')
        gain_figure = plot_gain_points(aspects_with_ratings)
        st.plotly_chart(gain_figure, use_container_width=True)

    st.write("### Sentiment for each aspect")
    st.write('(0-3 Negative) (4 Neutral) (5 Positive)')
    sentiment_figure = plot_sentiment_by_aspect(aspects_with_ratings)
    st.plotly_chart(sentiment_figure, use_container_width=True)
except ValueError as err:
    st.error(err)