# Hugging Face Space: shresht8/sentiment-analysis-excel (status: Running)
# File size: 10,242 Bytes

import gradio as gr
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import plotly.express as px
import plotly.graph_objects as go
from collections import defaultdict

# Load the model and tokenizer once at import time so that every call to
# predict_sentiment reuses the same instances instead of reloading them.
model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


def predict_sentiment(texts, batch_size=32):
    """
    Predict a sentiment label for each text in a list.

    The texts are run through the module-level ``tokenizer`` and ``model`` in
    chunks of ``batch_size`` so that memory use does not grow with the number
    of reviews in the uploaded file.

    Args:
        texts: Iterable of review texts. ``None`` entries are treated as empty
            strings and any other non-string value is converted with
            ``str()`` before tokenization.
        batch_size: Maximum number of texts sent through the model in one
            forward pass. Must be at least 1.

    Returns:
        A list of labels ("Very Negative", "Negative", "Neutral", "Positive",
        "Very Positive"), one per input text and in the same order. An empty
        input yields an empty list without touching the model.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentiment_map = {
        0: "Very Negative",
        1: "Negative",
        2: "Neutral",
        3: "Positive",
        4: "Very Positive"
    }

    # Spreadsheet cells may hold numbers or other non-text values; normalise
    # everything to str up front so the tokenizer only ever sees strings.
    cleaned = ["" if text is None else str(text) for text in texts]

    labels = []
    for start in range(0, len(cleaned), batch_size):
        batch = cleaned[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            logits = model(**inputs).logits
        # Softmax is monotonic, so the argmax of the raw logits selects the
        # same class as the argmax of the probabilities.
        predicted_ids = torch.argmax(logits, dim=-1).tolist()
        labels.extend(sentiment_map[class_id] for class_id in predicted_ids)
    return labels


def process_single_sheet(df, product_name):
    """
    Run sentiment analysis on the 'Reviews' column of one sheet or file.

    Args:
        df: DataFrame containing a 'Reviews' column. A 'Sentiment' column is
            added to this DataFrame in place.
        product_name: Name of the product/sheet; used only in the error
            message when the 'Reviews' column is missing.

    Returns:
        A tuple ``(df, sentiment_counts)`` where ``df`` is the input DataFrame
        with the new 'Sentiment' column and ``sentiment_counts`` is a Series
        mapping each predicted label to its number of occurrences.

    Raises:
        ValueError: If ``df`` has no 'Reviews' column.
    """
    if 'Reviews' not in df.columns:
        raise ValueError(f"'Reviews' column not found in sheet/file for {product_name}")

    # Missing cells become empty strings; everything else is coerced to text
    # so that numeric-only cells are still passed on as strings.
    reviews = df['Reviews'].fillna("").astype(str).tolist()

    # A sheet without rows has nothing to classify, so skip the model call.
    sentiments = predict_sentiment(reviews) if reviews else []
    df['Sentiment'] = sentiments

    # Calculate sentiment distribution (explicit dtype keeps the empty case
    # well-defined).
    sentiment_counts = pd.Series(sentiments, dtype="object").value_counts()

    return df, sentiment_counts


def create_comparison_charts(sentiment_results):
    """
    Create investment-focused comparison charts for different products.

    NOTE: a later definition of ``create_comparison_charts`` in this file
    rebinds the name, and ``process_file`` unpacks two return values, so this
    three-value version is not the one the app calls at runtime.

    Args:
        sentiment_results: Mapping of product name to a pandas Series of
            review counts indexed by sentiment label (as produced by
            ``value_counts``).

    Returns:
        A tuple ``(score_fig, ratio_fig, summary_df)``:
        ``score_fig`` is a bar chart of the weighted 0-100 sentiment score,
        ``ratio_fig`` is a grouped bar chart of positive vs negative
        percentages, and ``summary_df`` is a per-product metrics table sorted
        by sentiment score in descending order.
    """
    # Weight of each sentiment in the overall score (0 to 100)
    sentiment_weights = {
        'Very Negative': 0,
        'Negative': 25,
        'Neutral': 50,
        'Positive': 75,
        'Very Positive': 100
    }
    sentiment_labels = list(sentiment_weights)

    # Prepare data for plotting. Every row carries all five sentiment labels,
    # so a label that a product never received is 0% rather than NaN (NaN
    # would otherwise turn that product's weighted score into NaN).
    plot_data = []
    for product, sentiment_counts in sentiment_results.items():
        counts_by_label = sentiment_counts.to_dict()
        total = sum(counts_by_label.values())

        row = {
            'Product': product,
            'Total Reviews': total
        }
        for label in sentiment_labels:
            count = counts_by_label.get(label, 0)
            row[label] = (count / total) * 100 if total else 0.0
        plot_data.append(row)

    # Explicit columns keep the frame well-formed even with no products.
    df = pd.DataFrame(plot_data, columns=['Product', 'Total Reviews', *sentiment_labels])
    df[sentiment_labels] = df[sentiment_labels].astype(float)

    # Weighted sentiment score, computed column-wise for all products at once
    weighted_score = sum(
        df[label] * weight / 100 for label, weight in sentiment_weights.items()
    )
    df['Sentiment Score'] = weighted_score.round(2)

    # Create sentiment score chart
    score_fig = go.Figure()
    score_fig.add_trace(go.Bar(
        x=df['Product'],
        y=df['Sentiment Score'],
        text=df['Sentiment Score'].round(1),
        textposition='auto',
        marker_color='rgb(65, 105, 225)'
    ))
    score_fig.update_layout(
        title='Overall Sentiment Score by Product (0-100)',
        yaxis_title='Weighted Sentiment Score',
        yaxis_range=[0, 100],
        showlegend=False
    )

    # Calculate Positive-Negative Ratios
    df['Positive Ratio'] = df[['Positive', 'Very Positive']].sum(axis=1)
    df['Negative Ratio'] = df[['Negative', 'Very Negative']].sum(axis=1)

    # Create Positive-Negative ratio chart
    ratio_fig = go.Figure()
    ratio_fig.add_trace(go.Bar(
        name='Positive',
        x=df['Product'],
        y=df['Positive Ratio'],
        marker_color='rgb(50, 205, 50)'
    ))
    ratio_fig.add_trace(go.Bar(
        name='Negative',
        x=df['Product'],
        y=df['Negative Ratio'],
        marker_color='rgb(220, 20, 60)'
    ))
    ratio_fig.update_layout(
        barmode='group',
        title='Positive vs Negative Sentiment Ratio by Product',
        yaxis_title='Percentage (%)'
    )

    # Create summary table with investment-relevant metrics
    summary_df = pd.DataFrame({
        'Product': df['Product'],
        'Total Reviews': df['Total Reviews'],
        'Sentiment Score (0-100)': df['Sentiment Score'],
        'Positive Ratio (%)': df['Positive Ratio'].round(2),
        'Negative Ratio (%)': df['Negative Ratio'].round(2),
        'Neutral Ratio (%)': df['Neutral'].round(2)
    })

    # Confidence Score: share of opinionated reviews relative to neutral ones.
    # A 0% neutral share is replaced by 0.001 to avoid division by zero.
    summary_df['Confidence Score'] = ((summary_df['Positive Ratio (%)'] + summary_df['Negative Ratio (%)']) /
                                      summary_df['Neutral Ratio (%)'].replace(0, 0.001)).round(2)

    # Sort by Sentiment Score for easy comparison
    summary_df = summary_df.sort_values('Sentiment Score (0-100)', ascending=False)

    return score_fig, ratio_fig, summary_df


def process_file(file_obj):
    """
    Process an uploaded CSV/Excel file and build the sentiment report.

    A CSV file is treated as a single product named "Product". In an Excel
    workbook every sheet is treated as one product, named after the sheet.
    Each sheet must contain a 'Reviews' column.

    NOTE: the report is always written to the same relative path
    ("sentiment_analysis_results.xlsx"), so each call overwrites the report
    of the previous one.

    Args:
        file_obj: The uploaded file, either as a path string or as an object
            exposing the path through a ``name`` attribute.

    Returns:
        A tuple ``(distribution_plot, summary_table, output_path)``: the
        figure and summary DataFrame returned by ``create_comparison_charts``
        and the path of the Excel report that was written.

    Raises:
        gr.Error: On any failure (no file, unsupported extension, missing
            'Reviews' column, read/write errors), carrying the original
            error message so it can be shown in the UI.
    """
    try:
        if file_obj is None:
            raise ValueError("No file uploaded. Please upload a CSV or Excel file.")

        # Accept both a plain path string and an object carrying the path in
        # ``.name`` (presumably which one arrives depends on how gr.File is
        # configured - confirm against the installed Gradio version).
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        # Extension check is case-insensitive so "DATA.CSV" is accepted too.
        lowered_path = file_path.lower()

        sentiment_results = {}
        all_processed_dfs = {}

        if lowered_path.endswith('.csv'):
            df = pd.read_csv(file_path)
            product_name = "Product"  # Default name for CSV
            processed_df, sentiment_counts = process_single_sheet(df, product_name)
            all_processed_dfs[product_name] = processed_df
            sentiment_results[product_name] = sentiment_counts

        elif lowered_path.endswith(('.xlsx', '.xls')):
            # Open the workbook once, reuse it for every sheet, and make sure
            # the underlying file handle is closed afterwards.
            with pd.ExcelFile(file_path) as excel_file:
                for sheet_name in excel_file.sheet_names:
                    df = pd.read_excel(excel_file, sheet_name=sheet_name)
                    processed_df, sentiment_counts = process_single_sheet(df, sheet_name)
                    all_processed_dfs[sheet_name] = processed_df
                    sentiment_results[sheet_name] = sentiment_counts
        else:
            raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")

        # Create visualizations
        distribution_plot, summary_table = create_comparison_charts(sentiment_results)

        # Pick a summary sheet name that does not clash with a product sheet
        # (compared case-insensitively).
        used_names = {str(name).lower() for name in all_processed_dfs}
        summary_sheet = 'Summary'
        suffix = 1
        while summary_sheet.lower() in used_names:
            suffix += 1
            summary_sheet = f'Summary ({suffix})'

        # Save results
        output_path = "sentiment_analysis_results.xlsx"
        with pd.ExcelWriter(output_path) as writer:
            for sheet_name, df in all_processed_dfs.items():
                df.to_excel(writer, sheet_name=sheet_name, index=False)
            summary_table.to_excel(writer, sheet_name=summary_sheet, index=False)

        return (
            distribution_plot,
            summary_table,
            output_path
        )

    except Exception as e:
        # Surface the message in the Gradio UI while keeping the original
        # traceback attached for debugging.
        raise gr.Error(str(e)) from e


# Simplified chart builder. This redefinition replaces the earlier
# create_comparison_charts above and is the one process_file calls at runtime.
def create_comparison_charts(sentiment_results):
    """
    Create simplified, investment-focused comparison charts.

    Args:
        sentiment_results: Mapping of product name to a pandas Series of
            review counts indexed by sentiment label (as produced by
            ``value_counts``).

    Returns:
        A tuple ``(stack_fig, summary_df)``: ``stack_fig`` is a stacked bar
        chart of the percentage of each sentiment per product, and
        ``summary_df`` lists total reviews plus aggregated positive, neutral
        and negative percentages per product, sorted by positive percentage
        in descending order.
    """
    # Trace order for the stacked chart (first entry is added first), with
    # the matching colour for each sentiment.
    sentiments = ['Very Positive', 'Positive', 'Neutral', 'Negative', 'Very Negative']
    colors = ['rgb(39, 174, 96)', 'rgb(46, 204, 113)',
              'rgb(241, 196, 15)', 'rgb(231, 76, 60)',
              'rgb(192, 57, 43)']

    # Prepare data. Every row carries all five sentiment labels, so a label
    # a product never received is 0% rather than NaN in the chart and table.
    plot_data = []
    for product, sentiment_counts in sentiment_results.items():
        counts_by_label = sentiment_counts.to_dict()
        total = sum(counts_by_label.values())

        row = {
            'Product': product,
            'Total Reviews': total
        }
        for label in sentiments:
            count = counts_by_label.get(label, 0)
            row[label] = (count / total) * 100 if total else 0.0
        plot_data.append(row)

    # Explicit columns keep the frame well-formed even with no products.
    df = pd.DataFrame(plot_data, columns=['Product', 'Total Reviews', *sentiments])
    df[sentiments] = df[sentiments].astype(float)

    # 1. Simple Stacked Bar Chart showing sentiment distribution
    stack_fig = go.Figure()
    for sentiment, color in zip(sentiments, colors):
        stack_fig.add_trace(go.Bar(
            name=sentiment,
            x=df['Product'],
            y=df[sentiment],
            marker_color=color
        ))

    stack_fig.update_layout(
        barmode='stack',
        title='Sentiment Distribution by Product',
        yaxis_title='Percentage (%)'
    )

    # 2. Aggregated Sentiment Ratios for Quick Comparison
    df['Positive_Total'] = df[['Positive', 'Very Positive']].sum(axis=1)
    df['Negative_Total'] = df[['Negative', 'Very Negative']].sum(axis=1)

    summary_df = pd.DataFrame({
        'Product': df['Product'],
        'Total Reviews': df['Total Reviews'],
        'Positive (%)': df['Positive_Total'].round(2),
        'Neutral (%)': df['Neutral'].round(2),
        'Negative (%)': df['Negative_Total'].round(2)
    })

    # Sort by Positive percentage for easy comparison
    summary_df = summary_df.sort_values('Positive (%)', ascending=False)

    return stack_fig, summary_df


# Build the Gradio UI: a file upload, an Analyze button, and three outputs
# (distribution chart, summary table, downloadable report).
with gr.Blocks() as interface:
    gr.Markdown("# Product Review Sentiment Analysis")

    # User-facing instructions describing the expected input layout.
    gr.Markdown("""
    ### Quick Guide
    1. **Excel File (Multiple Products)**:
       - Create separate sheets for each product
       - Name sheets with product/company names
       - Include "Reviews" column in each sheet

    2. **CSV File (Single Product)**:
       - Include "Reviews" column

    Upload your file and click Analyze to get started.
    """)

    # Input: the review file to analyse.
    with gr.Row():
        file_input = gr.File(
            label="Upload File (CSV or Excel)",
            file_types=[".csv", ".xlsx", ".xls"]
        )

    with gr.Row():
        analyze_btn = gr.Button("Analyze Sentiments")

    # Outputs, in the same order as the tuple returned by process_file.
    with gr.Row():
        distribution_plot = gr.Plot(label="Sentiment Distribution")

    with gr.Row():
        summary_table = gr.Dataframe(label="Summary Metrics")

    with gr.Row():
        output_file = gr.File(label="Download Full Report")

    # Clicking the button runs process_file on the uploaded file and routes
    # its three return values to the three output components.
    analyze_btn.click(
        fn=process_file,
        inputs=[file_input],
        outputs=[distribution_plot, summary_table, output_file]
    )

# Start the app; this runs whenever the module is executed or imported.
interface.launch()