Spaces:

mgbam
/

DataBiz

Sleeping

App Files Files Community

mgbam committed on Jan 27

Commit

6e8a7d4

1 Parent(s): bdbd063

Add application file

Browse files

Files changed (1) hide show

app.py +483 -0

app.py ADDED Viewed

	@@ -0,0 +1,483 @@

+# app.py
+import streamlit as st
+import numpy as np
+import pandas as pd
+from smolagents import CodeAgent, tool
+from typing import Union, List, Dict, Optional
+import matplotlib.pyplot as plt
+import seaborn as sns
+import plotly.express as px
+import plotly.graph_objects as go
+import os
+from groq import Groq
+from dataclasses import dataclass
+import tempfile
+import base64
+import io
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
+import joblib
+import pdfkit  # Ensure wkhtmltopdf is available in the environment
+import uuid  # For generating unique report IDs
+# ------------------------------
+# Language Model Interface
+# ------------------------------
+class GroqLLM:
+    """Enhanced LLM interface with support for generating natural language summaries."""
+    def __init__(self, model_name="llama-3.1-8B-Instant"):
+        self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+        self.model_name = model_name
+    def __call__(self, prompt: Union[str, dict, List[Dict]]) -> str:
+        """Make the class callable as required by smolagents"""
+        try:
+            # Handle different prompt formats
+            if isinstance(prompt, (dict, list)):
+                prompt_str = str(prompt)
+            else:
+                prompt_str = str(prompt)
+            # Create a properly formatted message
+            completion = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[{
+                    "role": "user",
+                    "content": prompt_str
+                }],
+                temperature=0.7,
+                max_tokens=1500,  # Increased tokens for detailed responses
+                stream=False
+            )
+            return completion.choices[0].message.content if completion.choices else "Error: No response generated"
+        except Exception as e:
+            error_msg = f"Error generating response: {str(e)}"
+            print(error_msg)
+            return error_msg
+# ------------------------------
+# Data Analysis Agent
+# ------------------------------
+class DataAnalysisAgent(CodeAgent):
+    """Extended CodeAgent with dataset awareness and predictive analytics capabilities."""
+    def __init__(self, dataset: pd.DataFrame, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._dataset = dataset
+        self.models = {}  # To store trained models
+    @property
+    def dataset(self) -> pd.DataFrame:
+        """Access the stored dataset"""
+        return self._dataset
+    def run(self, prompt: str) -> str:
+        """Override run method to include dataset context and support predictive tasks"""
+        dataset_info = f"""
+        Dataset Shape: {self.dataset.shape}
+        Columns: {', '.join(self.dataset.columns)}
+        Data Types: {self.dataset.dtypes.to_dict()}
+        """
+        enhanced_prompt = f"""
+        Analyze the following dataset:
+        {dataset_info}
+        Task: {prompt}
+        Use the provided tools to analyze this specific dataset and return detailed results.
+        """
+        return super().run(enhanced_prompt)
+# ------------------------------
+# Tool Definitions
+# ------------------------------
+@tool
+def analyze_basic_stats(data: pd.DataFrame) -> str:
+    """Calculate and visualize basic statistical measures for numerical columns."""
+    if data is None:
+        data = tool.agent.dataset
+    stats = {}
+    numeric_cols = data.select_dtypes(include=[np.number]).columns
+    for col in numeric_cols:
+        stats[col] = {
+            'mean': float(data[col].mean()),
+            'median': float(data[col].median()),
+            'std': float(data[col].std()),
+            'skew': float(data[col].skew()),
+            'missing': int(data[col].isnull().sum())
+        }
+    # Generate a summary DataFrame
+    stats_df = pd.DataFrame(stats).T
+    stats_df.reset_index(inplace=True)
+    stats_df.rename(columns={'index': 'Feature'}, inplace=True)
+    # Plotting basic statistics
+    fig, ax = plt.subplots(figsize=(10, 6))
+    stats_df.set_index('Feature')[['mean', 'median', 'std']].plot(kind='bar', ax=ax)
+    plt.title('Basic Statistics')
+    plt.ylabel('Values')
+    plt.tight_layout()
+    # Save plot to buffer
+    buf = io.BytesIO()
+    plt.savefig(buf, format='png')
+    plt.close()
+    stats_plot = base64.b64encode(buf.getvalue()).decode()
+    return f"### Basic Statistics\n{stats_df.to_markdown()} \n\n![Basic Statistics](data:image/png;base64,{stats_plot})"
+@tool
+def generate_correlation_matrix(data: pd.DataFrame) -> str:
+    """Generate an interactive correlation matrix using Plotly."""
+    if data is None:
+        data = tool.agent.dataset
+    numeric_data = data.select_dtypes(include=[np.number])
+    corr = numeric_data.corr()
+    fig = px.imshow(corr,
+                    text_auto=True,
+                    aspect="auto",
+                    color_continuous_scale='RdBu',
+                    title='Correlation Matrix')
+    fig.update_layout(width=800, height=600)
+    # Convert Plotly figure to HTML div
+    correlation_html = fig.to_html(full_html=False)
+    return correlation_html
+@tool
+def analyze_categorical_columns(data: pd.DataFrame) -> str:
+    """Analyze categorical columns with visualizations."""
+    if data is None:
+        data = tool.agent.dataset
+    categorical_cols = data.select_dtypes(include=['object', 'category']).columns
+    analysis = {}
+    plots = ""
+    for col in categorical_cols:
+        unique_vals = data[col].nunique()
+        top_categories = data[col].value_counts().head(5).to_dict()
+        missing = data[col].isnull().sum()
+        analysis[col] = {
+            'unique_values': int(unique_vals),
+            'top_categories': top_categories,
+            'missing': int(missing)
+        }
+        # Generate bar chart for top categories
+        fig, ax = plt.subplots(figsize=(8, 4))
+        sns.countplot(data=data, x=col, order=data[col].value_counts().iloc[:5].index, ax=ax)
+        plt.title(f'Top 5 Categories in {col}')
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+        buf = io.BytesIO()
+        plt.savefig(buf, format='png')
+        plt.close()
+        plot_img = base64.b64encode(buf.getvalue()).decode()
+        plots += f"### {col}\n"
+        plots += f"- **Unique Values:** {unique_vals}\n"
+        plots += f"- **Missing Values:** {missing}\n"
+        plots += f"- **Top Categories:** {top_categories}\n"
+        plots += f"![Top Categories in {col}](data:image/png;base64,{plot_img})\n\n"
+    return plots + f"### Categorical Columns Analysis\n{pd.DataFrame(analysis).T.to_markdown()}"
+@tool
+def suggest_features(data: pd.DataFrame) -> str:
+    """Suggest potential feature engineering steps based on data characteristics."""
+    if data is None:
+        data = tool.agent.dataset
+    suggestions = []
+    numeric_cols = data.select_dtypes(include=[np.number]).columns
+    categorical_cols = data.select_dtypes(include=['object', 'category']).columns
+    # Interaction terms
+    if len(numeric_cols) >= 2:
+        suggestions.append("• **Interaction Terms:** Consider creating interaction terms between numerical features to capture combined effects.")
+    # Encoding categorical variables
+    if len(categorical_cols) > 0:
+        suggestions.append("• **One-Hot Encoding:** Apply one-hot encoding to categorical variables to convert them into numerical format.")
+        suggestions.append("• **Label Encoding:** For ordinal categorical variables, consider label encoding to maintain order information.")
+    # Handling skewness
+    for col in numeric_cols:
+        if data[col].skew() > 1 or data[col].skew() < -1:
+            suggestions.append(f"• **Log Transformation:** Apply log transformation to `{col}` to reduce skewness and stabilize variance.")
+    # Missing value imputation
+    for col in data.columns:
+        if data[col].isnull().sum() > 0:
+            suggestions.append(f"• **Imputation:** Consider imputing missing values in `{col}` using mean, median, or advanced imputation techniques.")
+    # Feature scaling
+    suggestions.append("• **Feature Scaling:** Apply feature scaling (Standardization or Normalization) to numerical features to ensure uniformity.")
+    return "\n".join(suggestions)
+@tool
+def predictive_analysis(data: pd.DataFrame, target: str) -> str:
+    """Perform predictive analytics by training a classification model."""
+    if data is None:
+        data = tool.agent.dataset
+    if target not in data.columns:
+        return f"Error: Target column `{target}` not found in the dataset."
+    # Handle categorical target
+    if data[target].dtype == 'object' or data[target].dtype.name == 'category':
+        data[target] = data[target].astype('category').cat.codes
+    # Drop rows with missing target
+    data = data.dropna(subset=[target])
+    # Separate features and target
+    X = data.drop(columns=[target])
+    y = data[target]
+    # Handle missing values (simple imputation)
+    X = X.fillna(X.median())
+    # Encode categorical variables
+    X = pd.get_dummies(X, drop_first=True)
+    # Split data
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    # Train a Random Forest Classifier (as an example)
+    from sklearn.ensemble import RandomForestClassifier
+    clf = RandomForestClassifier(n_estimators=100, random_state=42)
+    clf.fit(X_train, y_train)
+    # Predictions
+    y_pred = clf.predict(X_test)
+    y_proba = clf.predict_proba(X_test)[:,1]
+    # Evaluation
+    report = classification_report(y_test, y_pred, output_dict=True)
+    report_df = pd.DataFrame(report).transpose()
+    # Confusion Matrix
+    cm = confusion_matrix(y_test, y_pred)
+    fig_cm = px.imshow(cm, text_auto=True, labels=dict(x="Predicted", y="Actual", color="Count"),
+                       x=["Negative", "Positive"], y=["Negative", "Positive"],
+                       title="Confusion Matrix")
+    # ROC Curve
+    fpr, tpr, thresholds = roc_curve(y_test, y_proba)
+    roc_auc = auc(fpr, tpr)
+    fig_roc = go.Figure()
+    fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Curve (AUC = {roc_auc:.2f})'))
+    fig_roc.add_trace(go.Scatter(x=[0,1], y=[0,1], mode='lines', name='Random Guess', line=dict(dash='dash')))
+    fig_roc.update_layout(title='Receiver Operating Characteristic (ROC) Curve',
+                          xaxis_title='False Positive Rate',
+                          yaxis_title='True Positive Rate')
+    # Save models for potential future use
+    model_id = str(uuid.uuid4())
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.joblib') as tmp_model_file:
+        joblib.dump(clf, tmp_model_file.name)
+        # In a real-world scenario, you'd store this in a persistent storage
+    tool.agent.models[model_id] = clf  # Storing in agent's models dict
+    # Generate HTML for plots
+    cm_html = fig_cm.to_html(full_html=False)
+    roc_html = fig_roc.to_html(full_html=False)
+    # Generate report summary
+    summary = f"""
+    ### Predictive Analytics Report for Target: `{target}`
+    **Model Used:** Random Forest Classifier
+    **Classification Report:**
+    {report_df.to_markdown()}
+    **Confusion Matrix:**
+    {cm_html}
+    **ROC Curve:**
+    {roc_html}
+    **AUC Score:** {roc_auc:.2f}
+    **Model ID:** `{model_id}`
+    *You can use this Model ID to retrieve or update the model in future analyses.*
+    """
+    return summary
+# ------------------------------
+# Report Exporting Function
+# ------------------------------
+def export_report(content: str, filename: str):
+    """Export the given content as a PDF report."""
+    # Save content to a temporary HTML file
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as tmp_file:
+        tmp_file.write(content.encode('utf-8'))
+        tmp_file_path = tmp_file.name
+    # Define output PDF path
+    pdf_path = f"{filename}.pdf"
+    # Convert HTML to PDF using pdfkit
+    try:
+        # Configure pdfkit options for HuggingFace Spaces environment
+        config = pdfkit.configuration()
+        pdfkit.from_file(tmp_file_path, pdf_path, configuration=config)
+        with open(pdf_path, "rb") as pdf_file:
+            PDFbyte = pdf_file.read()
+        # Provide download link
+        st.download_button(label="📥 Download Report as PDF",
+                           data=PDFbyte,
+                           file_name=pdf_path,
+                           mime='application/octet-stream')
+    except Exception as e:
+        st.error(f"⚠️ Error exporting report: {str(e)}")
+    finally:
+        os.remove(tmp_file_path)
+        if os.path.exists(pdf_path):
+            os.remove(pdf_path)
+# ------------------------------
+# Main Application Function
+# ------------------------------
+def main():
+    st.set_page_config(page_title="📊 Business Intelligence Assistant", layout="wide")
+    st.title("📊 **Business Intelligence Assistant**")
+    st.write("Upload your dataset and receive comprehensive analyses, interactive visualizations, and predictive insights.")
+    # Initialize session state
+    if 'data' not in st.session_state:
+        st.session_state['data'] = None
+    if 'agent' not in st.session_state:
+        st.session_state['agent'] = None
+    if 'report_content' not in st.session_state:
+        st.session_state['report_content'] = ""
+    # File Uploader
+    uploaded_file = st.file_uploader("📥 **Upload a CSV file**", type="csv")
+    try:
+        if uploaded_file is not None:
+            with st.spinner('🔄 Loading and processing your data...'):
+                # Load the dataset
+                data = pd.read_csv(uploaded_file)
+                st.session_state['data'] = data
+                # Initialize the agent with the dataset
+                st.session_state['agent'] = DataAnalysisAgent(
+                    dataset=data,
+                    tools=[analyze_basic_stats, generate_correlation_matrix,
+                           analyze_categorical_columns, suggest_features, predictive_analysis],
+                    model=GroqLLM(),
+                    additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn", "plotly"]
+                )
+                st.success(f"✅ Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns")
+                st.subheader("🔍 **Data Preview**")
+                st.dataframe(data.head())
+        if st.session_state['data'] is not None:
+            # Sidebar for Analysis Selection
+            st.sidebar.header("🛠️ **Select Analysis Type**")
+            analysis_type = st.sidebar.selectbox(
+                "Choose analysis type",
+                ["Basic Statistics", "Correlation Analysis", "Categorical Analysis",
+                 "Feature Engineering", "Predictive Analytics", "Custom Question"]
+            )
+            if analysis_type == "Basic Statistics":
+                with st.spinner('📈 Analyzing basic statistics...'):
+                    result = st.session_state['agent'].run(
+                        "Use the analyze_basic_stats tool to analyze this dataset and "
+                        "provide insights about the numerical distributions."
+                    )
+                    st.markdown(result, unsafe_allow_html=True)
+                    st.session_state['report_content'] += result + "\n\n"
+            elif analysis_type == "Correlation Analysis":
+                with st.spinner('📊 Generating correlation matrix...'):
+                    result = st.session_state['agent'].run(
+                        "Use the generate_correlation_matrix tool to analyze correlations "
+                        "and explain any strong relationships found."
+                    )
+                    st.components.v1.html(result, height=600)
+                    st.session_state['report_content'] += "### Correlation Analysis\n" + result + "\n\n"
+            elif analysis_type == "Categorical Analysis":
+                with st.spinner('📊 Analyzing categorical columns...'):
+                    result = st.session_state['agent'].run(
+                        "Use the analyze_categorical_columns tool to examine the "
+                        "categorical variables and explain the distributions."
+                    )
+                    st.markdown(result, unsafe_allow_html=True)
+                    st.session_state['report_content'] += result + "\n\n"
+            elif analysis_type == "Feature Engineering":
+                with st.spinner('🔧 Generating feature suggestions...'):
+                    result = st.session_state['agent'].run(
+                        "Use the suggest_features tool to recommend potential "
+                        "feature engineering steps for this dataset."
+                    )
+                    st.markdown(result, unsafe_allow_html=True)
+                    st.session_state['report_content'] += result + "\n\n"
+            elif analysis_type == "Predictive Analytics":
+                with st.form("Predictive Analytics Form"):
+                    st.write("🔮 **Predictive Analytics**")
+                    target = st.selectbox("Select the target variable for prediction:", options=st.session_state['data'].columns)
+                    submit = st.form_submit_button("🚀 Run Predictive Analysis")
+                if submit:
+                    with st.spinner('🚀 Performing predictive analysis...'):
+                        result = st.session_state['agent'].run(
+                            f"Use the predictive_analysis tool to build a classification model with `{target}` as the target variable."
+                        )
+                        st.markdown(result, unsafe_allow_html=True)
+                        st.session_state['report_content'] += result + "\n\n"
+                        export_report(result, "Predictive_Analysis_Report")
+            elif analysis_type == "Custom Question":
+                with st.expander("📝 **Ask a Custom Question**"):
+                    question = st.text_input("What would you like to know about your data?")
+                    if st.button("🔍 Get Answer"):
+                        if question:
+                            with st.spinner('🧠 Processing your question...'):
+                                result = st.session_state['agent'].run(question)
+                                st.markdown(result, unsafe_allow_html=True)
+                                st.session_state['report_content'] += f"### Custom Question: {question}\n{result}\n\n"
+                        else:
+                            st.warning("Please enter a question.")
+            # Option to Export Report
+            if st.session_state['report_content']:
+                st.sidebar.markdown("---")
+                if st.sidebar.button("📤 **Export Analysis Report**"):
+                    export_report(st.session_state['report_content'], "Business_Intelligence_Report")
+                    st.sidebar.success("✅ Report exported successfully!")
+    except Exception as e:
+        st.error(f"⚠️ An error occurred: {str(e)}")
+# ------------------------------
+# Application Entry Point
+# ------------------------------
+# Start the app only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()