Spaces:

analytics-jiten
/

reviews-insights

Sleeping

App Files Files Community

analytics-jiten committed on Jan 4, 2024

Commit

01ad901

1 Parent(s): d3449c0

Create app.py

Browse files

Files changed (1) hide show

app.py +206 -0

app.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import spacy
+import pandas as pd
+import streamlit as st
+from preprocessing import preprocess_reviews
+from aspects_extraction import extract_aspects
+from clustering import cluster_aspect_terms
+import plotly.express as px
+import matplotlib.pyplot as plt
+defaultCsv = {
+    'Serco USB Hub + Sound Card': 'reviews.csv',
+    'Honey': 'reviews_honey.csv',
+}
+st.set_page_config(
+    page_title="Actionble Insights From Reviews",
+    layout="wide",
+)
+@st.cache
+def load_reviews(uploaded_file=None, default_file=None):
+  if default_file is not None:
+    reviews = pd.read_csv(default_file)
+  if uploaded_file is not None:
+    reviews = pd.read_csv(uploaded_file)
+  reviews = validate_reviews_dataframe(reviews)
+  return preprocess_reviews(reviews)
+def validate_reviews_dataframe(r):
+  if 'title' not in r.columns:
+    raise ValueError("column title is required")
+  if 'review' not in r.columns:
+    raise ValueError("column review is required")
+  if 'rating' not in r.columns:
+    raise ValueError("column rating is required")
+  if r['title'].dtype != 'O':
+    raise ValueError("column title must be string")
+  if r['review'].dtype != 'O':
+    raise ValueError("column review must be string")
+  if r['rating'].dtype != 'float64':
+    raise ValueError("column rating must be float")
+  r = r.dropna()
+  if ((r['rating'] < 0) & (r['rating'] > 5)).any():
+    raise ValueError("values in column rating must be between 0 and 5")
+  return r
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
+def load_model():
+  return spacy.load("en_core_web_lg")
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
+def get_aspects(reviews):
+  nlp = load_model()
+  return extract_aspects(nlp, reviews)
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
+def cluster_aspects(aspects):
+  nlp = load_model()
+  replacements = cluster_aspect_terms(nlp, aspects)
+  aspects['aspect'] = aspects['aspect'].map(replacements)
+  return aspects
+def get_aspects_with_ratings(aspects, reviews):
+  aspect_with_ratings = pd.merge(aspects,
+  reviews[['rating']],
+  left_on='review_id',
+  right_index=True)
+  aspect_with_ratings['review_sentiment'] = pd.cut(aspect_with_ratings['rating'],
+        bins=[0, 3, 4, 5],
+        right=True,
+        labels=['Negative', 'Neutral', 'Positive']
+  )
+  return aspect_with_ratings
+def get_aspect_treemap(aspects):
+  treemap = px.treemap(aspects.groupby(['aspect', 'opinion']).size().reset_index(),
+      path=[px.Constant('Aspects'), 'aspect', 'opinion'],
+      values=0,
+  )
+  treemap.update_layout(margin = dict(t=0, l=0, r=0, b=0))
+  return treemap
+def plot_pain_points(aspect_with_ratings):
+  pain_points = (aspect_with_ratings
+    .query('review_sentiment == "Negative"')
+    .groupby('aspect')
+    .size()
+    .sort_values(ascending=False)[:10]
+  )
+  fig = px.bar(pain_points)
+  fig.update_layout(margin = dict(t=0, l=0, r=0, b=0))
+  fig.update_traces(marker_color='red', showlegend=False)
+  return fig
+def plot_gain_points(aspect_with_ratings):
+  gain_points = (aspect_with_ratings
+    .query('review_sentiment == "Positive"')
+    .groupby('aspect')
+    .size()
+    .sort_values(ascending=False)[:10]
+  )
+  fig = px.bar(gain_points)
+  fig.update_layout(margin = dict(t=0, l=0, r=0, b=0))
+  fig.update_traces(marker_color='green', showlegend=False)
+  return fig
+def plot_sentiment_by_aspect(aspect_with_ratings, top=15):
+  pivot = pd.crosstab(
+    index=aspect_with_ratings['aspect'],
+      columns=aspect_with_ratings['review_sentiment'],
+      margins=True,
+  ).sort_values(by='All', ascending=False).iloc[1:, :-1]
+  fig = px.bar(pivot[:top], barmode='group', color_discrete_map={
+      'Positive': 'green',
+      'Negative': 'red',
+      'Neutral': 'blue',
+  })
+  fig.update_layout(margin = dict(t=0, l=0, r=0, b=0))
+  return fig
+st.write("## Actionble Insights From Reviews")
+st.write("""
+Key to building a successfull product is understanding what users want and what users don't want.
+This insight can be useful in serveral ways.
+1. Designing product that users actually want.
+2. Fixing defects in product or addressing users pain points.
+3. Staying ahead of the competition.
+There are millions of reviews that people leave on sites like amazon, tripadvisor etc.
+To gain insights from this data, you could either read all the reviews one by one or
+let machine analyze these reviews and find main topics that user care about.
+""")
+st.write("## Extracting Aspect Opinion Pairs")
+st.write("""
+Let's say the customer wrote, `The material of the shirt is not soft`.
+Here `material` is the `aspect` of shirt and `not soft` is the users `opinion`
+about this aspect. The analyzer finds aspect opinion pairs from the reviews.
+""")
+st.write("### Customer Reviews")
+st.write("""
+Dataframe containing reviews of the customer. Title, review, and rating columns are required
+""")
+st.sidebar.title("Select Reviews File")
+default_file = st.sidebar.selectbox(
+    "Choose Sample File",
+    defaultCsv.keys(),
+)
+if default_file is not None:
+  default_file = defaultCsv[default_file]
+st.sidebar.write("<div style='text-align:center'>or</div>",  unsafe_allow_html=True)
+uploaded_file = st.sidebar.file_uploader(
+    'Choose a CSV File',
+    type='csv',
+)
+st.sidebar.write("CSV with title(string), review(string) and ratings(float 0-5) columns")
+try:
+  reviews = load_reviews(uploaded_file, default_file)
+  st.write(reviews)
+  aspects = get_aspects(reviews)
+  aspects = cluster_aspects(aspects)
+  aspects_with_ratings = get_aspects_with_ratings(aspects, reviews)
+  st.write("### Extracted Aspect Opinion Pairs")
+  st.write("""
+  Treemap of aspect opinion pairs extracted from reviews, treemap
+  is sized according to number of reviews.
+  """)
+  st.plotly_chart(get_aspect_treemap(aspects), use_container_width=True)
+  st.write("### Pain Points And Gain Points")
+  col1, col2 = st.columns(2)
+  with col1:
+    st.write('Top Pain Points (by number of -ve reviews)')
+    st.plotly_chart(plot_pain_points(aspects_with_ratings), use_container_width=True)
+  with col2:
+    st.write('Top Gain Points (by number of +ve reviews)')
+    st.plotly_chart(plot_gain_points(aspects_with_ratings), use_container_width=True)
+  st.write("### Sentiment for each aspect")
+  st.write('(0-3 Negative) (4 Neutral) (5 Positive)')
+  st.plotly_chart(plot_sentiment_by_aspect(aspects_with_ratings), use_container_width=True)
+except ValueError as e:
+  st.error(e)