Spaces:
Sleeping
Sleeping
Commit
·
01ad901
1
Parent(s):
d3449c0
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy
|
2 |
+
import pandas as pd
|
3 |
+
import streamlit as st
|
4 |
+
from preprocessing import preprocess_reviews
|
5 |
+
from aspects_extraction import extract_aspects
|
6 |
+
from clustering import cluster_aspect_terms
|
7 |
+
import plotly.express as px
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
|
10 |
+
|
11 |
+
# Sample datasets offered in the sidebar: display name -> CSV file path.
defaultCsv = {
    'Serco USB Hub + Sound Card': 'reviews.csv',
    'Honey': 'reviews_honey.csv',
}

# Page-level configuration; the title spelling is corrected here
# ("Actionble" -> "Actionable").
st.set_page_config(
    page_title="Actionable Insights From Reviews",
    layout="wide",
)
|
21 |
+
|
22 |
+
@st.cache
def load_reviews(uploaded_file=None, default_file=None):
    """Read a reviews CSV, validate it and run it through preprocessing.

    An uploaded file takes precedence over the sample file, matching the
    original behaviour where the upload overwrote the sample data.

    Args:
        uploaded_file: user-supplied CSV source accepted by ``pd.read_csv``,
            or None.
        default_file: path of a bundled sample CSV, or None.

    Returns:
        Whatever ``preprocess_reviews`` returns for the validated DataFrame.

    Raises:
        ValueError: if neither source is provided, or if
            ``validate_reviews_dataframe`` rejects the data.
    """
    source = uploaded_file if uploaded_file is not None else default_file
    if source is None:
        # Previously this path fell through to an unbound local variable;
        # ValueError is what the calling script catches and displays.
        raise ValueError("no reviews file was provided")

    # Only the chosen source is parsed; the sample file is no longer read
    # when an upload is present.
    reviews = pd.read_csv(source)
    reviews = validate_reviews_dataframe(reviews)
    return preprocess_reviews(reviews)
|
34 |
+
|
35 |
+
def validate_reviews_dataframe(r):
    """Check that a reviews DataFrame has the required columns and values.

    Args:
        r: DataFrame expected to contain ``title`` and ``review`` text
            columns and a float64 ``rating`` column.

    Returns:
        The DataFrame with every row containing a missing value dropped.

    Raises:
        ValueError: if a required column is missing, a column has the wrong
            dtype, or any rating is below 0 or above 5.
    """
    for column in ('title', 'review', 'rating'):
        if column not in r.columns:
            raise ValueError(f"column {column} is required")

    for column in ('title', 'review'):
        dtype = r[column].dtype
        # Text may be stored as plain object dtype or as pandas' dedicated
        # string dtype; both are acceptable here.
        if not (dtype == 'O' or isinstance(dtype, pd.StringDtype)):
            raise ValueError(f"column {column} must be string")

    if r['rating'].dtype != 'float64':
        raise ValueError("column rating must be float")

    r = r.dropna()

    # A rating is out of range when it is below 0 OR above 5. The original
    # combined the two tests with "&", which can never be true.
    if ((r['rating'] < 0) | (r['rating'] > 5)).any():
        raise ValueError("values in column rating must be between 0 and 5")
    return r
|
52 |
+
|
53 |
+
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the spaCy ``en_core_web_lg`` pipeline (cached by Streamlit)."""
    model_name = "en_core_web_lg"
    return spacy.load(model_name)
|
56 |
+
|
57 |
+
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def get_aspects(reviews):
    """Run ``extract_aspects`` over the reviews with the shared spaCy model.

    Args:
        reviews: preprocessed reviews, passed unchanged to ``extract_aspects``.

    Returns:
        The result of ``extract_aspects``.
    """
    return extract_aspects(load_model(), reviews)
|
61 |
+
|
62 |
+
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def cluster_aspects(aspects):
    """Replace each aspect term with its cluster representative.

    Args:
        aspects: DataFrame with an ``aspect`` column, as consumed by
            ``cluster_aspect_terms``.

    Returns:
        A new DataFrame equal to ``aspects`` except that the ``aspect``
        column has been mapped through the replacements returned by
        ``cluster_aspect_terms``.
    """
    nlp = load_model()
    replacements = cluster_aspect_terms(nlp, aspects)

    # Work on a copy. The script passes in the object returned by the cached
    # get_aspects(); overwriting its column in place would alter that cached
    # value, so a later rerun would start from already-replaced terms.
    clustered = aspects.copy()
    clustered['aspect'] = clustered['aspect'].map(replacements)
    return clustered
|
68 |
+
|
69 |
+
def get_aspects_with_ratings(aspects, reviews):
    """Attach each aspect's review rating and a sentiment label.

    Args:
        aspects: DataFrame with a ``review_id`` column whose values match
            the index of ``reviews``.
        reviews: DataFrame with a numeric ``rating`` column.

    Returns:
        ``aspects`` joined with ``rating`` plus a categorical
        ``review_sentiment`` column: ratings 0-3 are 'Negative', above 3 up
        to 4 'Neutral', above 4 up to 5 'Positive'.
    """
    aspect_with_ratings = pd.merge(
        aspects,
        reviews[['rating']],
        left_on='review_id',
        right_index=True,
    )
    aspect_with_ratings['review_sentiment'] = pd.cut(
        aspect_with_ratings['rating'],
        bins=[0, 3, 4, 5],
        right=True,
        # Without this the first bin is (0, 3], so a rating of exactly 0
        # (which validation allows) would get no sentiment at all.
        include_lowest=True,
        labels=['Negative', 'Neutral', 'Positive'],
    )
    return aspect_with_ratings
|
80 |
+
|
81 |
+
def get_aspect_treemap(aspects):
    """Build a treemap of aspect/opinion pairs sized by occurrence count.

    Args:
        aspects: DataFrame with ``aspect`` and ``opinion`` columns.

    Returns:
        The plotly treemap figure with all margins set to zero.
    """
    # size() yields an unnamed Series, so reset_index() labels the count
    # column 0; that is the column referenced by ``values`` below.
    pair_counts = aspects.groupby(['aspect', 'opinion']).size().reset_index()
    hierarchy = [px.Constant('Aspects'), 'aspect', 'opinion']
    figure = px.treemap(pair_counts, path=hierarchy, values=0)
    figure.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    return figure
|
88 |
+
|
89 |
+
def plot_pain_points(aspect_with_ratings):
    """Bar chart of the ten aspects occurring most often in negative reviews.

    Args:
        aspect_with_ratings: DataFrame with ``aspect`` and
            ``review_sentiment`` columns.

    Returns:
        A plotly bar figure with red bars, no legend and zero margins.
    """
    negative_rows = aspect_with_ratings.query('review_sentiment == "Negative"')
    counts_by_aspect = negative_rows.groupby('aspect').size()
    top_ten = counts_by_aspect.sort_values(ascending=False).iloc[:10]

    chart = px.bar(top_ten)
    chart.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    chart.update_traces(marker_color='red', showlegend=False)
    return chart
|
100 |
+
|
101 |
+
def plot_gain_points(aspect_with_ratings):
    """Bar chart of the ten aspects occurring most often in positive reviews.

    Args:
        aspect_with_ratings: DataFrame with ``aspect`` and
            ``review_sentiment`` columns.

    Returns:
        A plotly bar figure with green bars, no legend and zero margins.
    """
    positive_rows = aspect_with_ratings.query('review_sentiment == "Positive"')
    counts_by_aspect = positive_rows.groupby('aspect').size()
    top_ten = counts_by_aspect.sort_values(ascending=False).iloc[:10]

    chart = px.bar(top_ten)
    chart.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    chart.update_traces(marker_color='green', showlegend=False)
    return chart
|
112 |
+
|
113 |
+
def plot_sentiment_by_aspect(aspect_with_ratings, top=15):
    """Grouped bar chart of sentiment counts for the most frequent aspects.

    Args:
        aspect_with_ratings: DataFrame with ``aspect`` and
            ``review_sentiment`` columns.
        top: number of aspects, ranked by total count, to include.

    Returns:
        A plotly grouped bar figure with one bar per sentiment per aspect.
    """
    pivot = pd.crosstab(
        index=aspect_with_ratings['aspect'],
        columns=aspect_with_ratings['review_sentiment'],
        margins=True,
    ).sort_values(by='All', ascending=False)

    # Remove the 'All' totals row by label rather than by position: when a
    # single aspect accounts for every row its total ties with 'All', and the
    # sort does not guarantee that 'All' comes first.
    pivot = pivot.drop(index='All')
    # The 'All' totals column is the last column; it was only needed for
    # ranking the aspects.
    pivot = pivot.iloc[:, :-1]

    fig = px.bar(pivot.iloc[:top], barmode='group', color_discrete_map={
        'Positive': 'green',
        'Negative': 'red',
        'Neutral': 'blue',
    })
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    return fig
|
127 |
+
|
128 |
+
|
129 |
+
# Introductory copy shown at the top of the page. Spelling mistakes in the
# user-facing text ("Actionble", "successfull", "serveral") are corrected.
st.write("## Actionable Insights From Reviews")

st.write("""
Key to building a successful product is understanding what users want and what users don't want.

This insight can be useful in several ways.

1. Designing product that users actually want.
2. Fixing defects in product or addressing users pain points.
3. Staying ahead of the competition.

There are millions of reviews that people leave on sites like amazon, tripadvisor etc.
To gain insights from this data, you could either read all the reviews one by one or
let machine analyze these reviews and find main topics that user care about.
""")

st.write("## Extracting Aspect Opinion Pairs")
st.write("""
Let's say the customer wrote, `The material of the shirt is not soft`.
Here `material` is the `aspect` of shirt and `not soft` is the users `opinion`
about this aspect. The analyzer finds aspect opinion pairs from the reviews.
""")

st.write("### Customer Reviews")
st.write("""
Dataframe containing reviews of the customer. Title, review, and rating columns are required
""")
|
156 |
+
|
157 |
+
# Sidebar: choose a bundled sample file, or upload a CSV instead.
st.sidebar.title("Select Reviews File")

sample_name = st.sidebar.selectbox("Choose Sample File", defaultCsv.keys())
# Translate the chosen display name into its CSV path (None if no choice).
default_file = None if sample_name is None else defaultCsv[sample_name]

st.sidebar.write("<div style='text-align:center'>or</div>", unsafe_allow_html=True)

uploaded_file = st.sidebar.file_uploader('Choose a CSV File', type='csv')
st.sidebar.write("CSV with title(string), review(string) and ratings(float 0-5) columns")
|
174 |
+
|
175 |
+
# Main page flow: load the reviews, extract and cluster aspects, then render
# the charts. A ValueError anywhere in this flow is shown as an error box.
try:
    reviews = load_reviews(uploaded_file, default_file)
    st.write(reviews)

    # Aspect extraction -> term clustering -> join with the review ratings.
    aspects = cluster_aspects(get_aspects(reviews))
    aspects_with_ratings = get_aspects_with_ratings(aspects, reviews)

    st.write("### Extracted Aspect Opinion Pairs")
    st.write("""
    Treemap of aspect opinion pairs extracted from reviews, treemap
    is sized according to number of reviews.
    """)
    treemap_figure = get_aspect_treemap(aspects)
    st.plotly_chart(treemap_figure, use_container_width=True)

    st.write("### Pain Points And Gain Points")
    pain_column, gain_column = st.columns(2)

    with pain_column:
        st.write('Top Pain Points (by number of -ve reviews)')
        pain_figure = plot_pain_points(aspects_with_ratings)
        st.plotly_chart(pain_figure, use_container_width=True)

    with gain_column:
        st.write('Top Gain Points (by number of +ve reviews)')
        gain_figure = plot_gain_points(aspects_with_ratings)
        st.plotly_chart(gain_figure, use_container_width=True)

    st.write("### Sentiment for each aspect")
    st.write('(0-3 Negative) (4 Neutral) (5 Positive)')
    sentiment_figure = plot_sentiment_by_aspect(aspects_with_ratings)
    st.plotly_chart(sentiment_figure, use_container_width=True)
except ValueError as e:
    st.error(e)
|