Spaces:
Sleeping
Sleeping
import spacy | |
import pandas as pd | |
import streamlit as st | |
from preprocessing import preprocess_reviews | |
from aspects_extraction import extract_aspects | |
from clustering import cluster_aspect_terms | |
import plotly.express as px | |
import matplotlib.pyplot as plt | |
# Bundled sample datasets: display name shown in the sidebar -> CSV path.
defaultCsv = dict([
    ('Serco USB Hub + Sound Card', 'reviews.csv'),
    ('Honey', 'reviews_honey.csv'),
])
# Configure the browser tab title and use the full page width.
# (Title spelling corrected: "Actionble" -> "Actionable".)
st.set_page_config(
    page_title="Actionable Insights From Reviews",
    layout="wide",
)
def load_reviews(uploaded_file=None, default_file=None):
    """Read, validate and preprocess a reviews CSV.

    An uploaded file takes precedence over the bundled sample file; only the
    chosen source is read.

    Args:
        uploaded_file: Path or file-like object of a user-supplied CSV, or None.
        default_file: Path of a sample CSV used when nothing was uploaded, or None.

    Returns:
        Whatever ``preprocess_reviews`` returns for the validated DataFrame.

    Raises:
        ValueError: If neither source is given, or if validation fails.
    """
    source = uploaded_file if uploaded_file is not None else default_file
    if source is None:
        # Previously this fell through to an unbound local variable; raise the
        # exception type the caller already handles instead.
        raise ValueError("no reviews file was provided")

    reviews = pd.read_csv(source)
    reviews = validate_reviews_dataframe(reviews)
    return preprocess_reviews(reviews)
def validate_reviews_dataframe(r):
    """Check that a reviews DataFrame has the expected columns and values.

    Args:
        r: DataFrame expected to contain ``title``, ``review`` and ``rating``.

    Returns:
        A copy of ``r`` with every row containing a missing value dropped.

    Raises:
        ValueError: If a required column is missing, ``title``/``review`` are
            not string-typed, ``rating`` is not float64, or any remaining
            rating lies outside the range 0 to 5.
    """
    # All presence checks run before any dtype check.
    for column in ('title', 'review', 'rating'):
        if column not in r.columns:
            raise ValueError(f"column {column} is required")

    for column in ('title', 'review'):
        dtype = r[column].dtype
        # Accept object columns as before, and also pandas' dedicated string
        # dtype, which text columns may carry instead of object.
        is_text = (pd.api.types.is_object_dtype(dtype)
                   or pd.api.types.is_string_dtype(dtype))
        if not is_text:
            raise ValueError(f"column {column} must be string")

    if r['rating'].dtype != 'float64':
        raise ValueError("column rating must be float")

    r = r.dropna()

    # A rating is invalid when it is below 0 OR above 5. The original used
    # "&", which can never be true, so bad ratings were silently accepted.
    out_of_range = (r['rating'] < 0) | (r['rating'] > 5)
    if out_of_range.any():
        raise ValueError("values in column rating must be between 0 and 5")
    return r
def load_model():
    """Load and return the spaCy pipeline named ``en_core_web_lg``."""
    model_name = "en_core_web_lg"
    pipeline = spacy.load(model_name)
    return pipeline
def get_aspects(reviews):
    """Run aspect extraction over ``reviews`` with a freshly loaded model.

    Returns whatever ``extract_aspects`` produces for the given reviews.
    """
    return extract_aspects(load_model(), reviews)
def cluster_aspects(aspects):
    """Replace each aspect term with the value its cluster mapping assigns.

    The ``aspect`` column of ``aspects`` is overwritten in place and the same
    object is returned.
    """
    model = load_model()
    # NOTE(review): presumably a term -> representative-term mapping; any
    # value Series.map cannot translate becomes NaN - confirm the mapping
    # covers every aspect.
    term_replacements = cluster_aspect_terms(model, aspects)
    clustered_terms = aspects['aspect'].map(term_replacements)
    aspects['aspect'] = clustered_terms
    return aspects
def get_aspects_with_ratings(aspects, reviews):
    """Attach each aspect's review rating and a sentiment label derived from it.

    Args:
        aspects: DataFrame with a ``review_id`` column that refers to the
            index of ``reviews``.
        reviews: DataFrame with a ``rating`` column.

    Returns:
        The merged DataFrame with two extra columns: ``rating`` and the
        categorical ``review_sentiment`` (ratings 0 to 3 are ``Negative``,
        above 3 up to 4 ``Neutral``, above 4 up to 5 ``Positive``).
    """
    aspect_with_ratings = pd.merge(
        aspects,
        reviews[['rating']],
        left_on='review_id',
        right_index=True,
    )
    aspect_with_ratings['review_sentiment'] = pd.cut(
        aspect_with_ratings['rating'],
        bins=[0, 3, 4, 5],
        right=True,
        # Without this the first bin is (0, 3], so a rating of exactly 0
        # would receive no sentiment at all.
        include_lowest=True,
        labels=['Negative', 'Neutral', 'Positive'],
    )
    return aspect_with_ratings
def get_aspect_treemap(aspects):
    """Build a treemap of how often each (aspect, opinion) pair occurs.

    Tiles are nested under a constant ``Aspects`` root, then by aspect, then
    by opinion, and sized by the pair's row count.
    """
    # size() yields an unnamed Series, so reset_index() labels the count
    # column with the integer 0 - that is the column referenced below.
    pair_counts = aspects.groupby(['aspect', 'opinion']).size().reset_index()
    hierarchy = [px.Constant('Aspects'), 'aspect', 'opinion']
    figure = px.treemap(pair_counts, path=hierarchy, values=0)
    figure.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})
    return figure
def _plot_top_aspects(aspect_with_ratings, sentiment, color, top=10):
    """Bar chart of the ``top`` aspects with the most rows of one sentiment."""
    matching = aspect_with_ratings[
        aspect_with_ratings['review_sentiment'] == sentiment
    ]
    counts = (
        matching
        .groupby('aspect')
        .size()
        .sort_values(ascending=False)
        .head(top)
    )
    fig = px.bar(counts)
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    fig.update_traces(marker_color=color, showlegend=False)
    return fig


def plot_pain_points(aspect_with_ratings):
    """Return a red bar chart of the 10 aspects with the most ``Negative`` rows.

    Args:
        aspect_with_ratings: DataFrame with ``aspect`` and
            ``review_sentiment`` columns.
    """
    return _plot_top_aspects(aspect_with_ratings, 'Negative', 'red')


def plot_gain_points(aspect_with_ratings):
    """Return a green bar chart of the 10 aspects with the most ``Positive`` rows.

    Args:
        aspect_with_ratings: DataFrame with ``aspect`` and
            ``review_sentiment`` columns.
    """
    return _plot_top_aspects(aspect_with_ratings, 'Positive', 'green')
def plot_sentiment_by_aspect(aspect_with_ratings, top=15):
    """Grouped bar chart of sentiment counts for the most frequent aspects.

    Args:
        aspect_with_ratings: DataFrame with ``aspect`` and
            ``review_sentiment`` columns.
        top: Maximum number of aspects to show, most frequent first.

    Returns:
        A plotly figure with one bar per sentiment for each aspect.
    """
    counts = pd.crosstab(
        index=aspect_with_ratings['aspect'],
        columns=aspect_with_ratings['review_sentiment'],
    )
    # Order aspects by their own row totals. The earlier approach added an
    # "All" margin row and dropped whichever row sorted first, which removed
    # the wrong row when a single aspect tied with the total.
    order = (
        counts.sum(axis=1)
        .sort_values(ascending=False, kind='stable')
        .index
    )
    pivot = counts.loc[order]
    fig = px.bar(pivot.iloc[:top], barmode='group', color_discrete_map={
        'Positive': 'green',
        'Negative': 'red',
        'Neutral': 'blue',
    })
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    return fig
# Page introduction. Spelling corrected in the rendered text:
# "Actionble" -> "Actionable", "successfull" -> "successful",
# "serveral" -> "several".
st.write("## Actionable Insights From Reviews")
st.write("""
Key to building a successful product is understanding what users want and what users don't want.
This insight can be useful in several ways.
1. Designing product that users actually want.
2. Fixing defects in product or addressing users pain points.
3. Staying ahead of the competition.
There are millions of reviews that people leave on sites like amazon, tripadvisor etc.
To gain insights from this data, you could either read all the reviews one by one or
let machine analyze these reviews and find main topics that user care about.
""")

# Explanation of what an aspect/opinion pair is.
st.write("## Extracting Aspect Opinion Pairs")
st.write("""
Let's say the customer wrote, `The material of the shirt is not soft`.
Here `material` is the `aspect` of shirt and `not soft` is the users `opinion`
about this aspect. The analyzer finds aspect opinion pairs from the reviews.
""")

# Heading for the raw reviews table rendered further down.
st.write("### Customer Reviews")
st.write("""
Dataframe containing reviews of the customer. Title, review, and rating columns are required
""")
# Sidebar: choose one of the bundled sample files, or upload a CSV.
st.sidebar.title("Select Reviews File")

sample_name = st.sidebar.selectbox("Choose Sample File", defaultCsv.keys())
# Translate the chosen display name into its CSV path; stays None when
# nothing is selected.
default_file = None if sample_name is None else defaultCsv[sample_name]

st.sidebar.write(
    "<div style='text-align:center'>or</div>",
    unsafe_allow_html=True,
)

uploaded_file = st.sidebar.file_uploader('Choose a CSV File', type='csv')
st.sidebar.write(
    "CSV with title(string), review(string) and ratings(float 0-5) columns"
)
# Main pipeline: load reviews, extract and cluster aspects, then render the
# charts. Any ValueError raised along the way is shown as an error box
# instead of a traceback.
try:
    reviews = load_reviews(uploaded_file, default_file)
    st.write(reviews)

    aspects = cluster_aspects(get_aspects(reviews))
    aspects_with_ratings = get_aspects_with_ratings(aspects, reviews)

    st.write("### Extracted Aspect Opinion Pairs")
    st.write("""
Treemap of aspect opinion pairs extracted from reviews, treemap
is sized according to number of reviews.
""")
    treemap_fig = get_aspect_treemap(aspects)
    st.plotly_chart(treemap_fig, use_container_width=True)

    st.write("### Pain Points And Gain Points")
    pain_col, gain_col = st.columns(2)
    with pain_col:
        st.write('Top Pain Points (by number of -ve reviews)')
        pain_fig = plot_pain_points(aspects_with_ratings)
        st.plotly_chart(pain_fig, use_container_width=True)
    with gain_col:
        st.write('Top Gain Points (by number of +ve reviews)')
        gain_fig = plot_gain_points(aspects_with_ratings)
        st.plotly_chart(gain_fig, use_container_width=True)

    st.write("### Sentiment for each aspect")
    st.write('(0-3 Negative) (4 Neutral) (5 Positive)')
    sentiment_fig = plot_sentiment_by_aspect(aspects_with_ratings)
    st.plotly_chart(sentiment_fig, use_container_width=True)
except ValueError as e:
    st.error(e)