|
import streamlit as st |
|
import numpy as np |
|
import pandas as pd |
|
from langchain.tools import tool |
|
from langchain.agents import initialize_agent, AgentType |
|
from langchain.chat_models import ChatOpenAI |
|
from typing import Union, List, Dict, Optional |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import os |
|
import base64 |
|
import io |
|
|
|
|
|
# Fall back to the placeholder only when no key is configured, so that a real
# OPENAI_API_KEY already exported in the environment is never overwritten.
os.environ.setdefault("OPENAI_API_KEY", "your-openai-api-key")

# Shared chat model handed to the agent that main() builds.
llm = ChatOpenAI(model="gpt-4", temperature=0.7)
|
|
|
@tool |
|
def analyze_basic_stats(data: pd.DataFrame) -> str:
    """Summarise the numerical columns of the dataset with basic statistics.

    Args:
        data (pd.DataFrame): The dataset to analyze. Only columns with a
            numeric dtype are included in the result.

    Returns:
        str: The string form of a dict that maps each numerical column name to
            its mean, median, standard deviation, skewness, and missing value
            count. The dict is empty when there are no numerical columns.
    """

    def _describe(series):
        # Cast to built-in float/int so the string output shows plain numbers.
        return {
            'mean': float(series.mean()),
            'median': float(series.median()),
            'std': float(series.std()),
            'skew': float(series.skew()),
            'missing': int(series.isnull().sum()),
        }

    numeric_names = data.select_dtypes(include=[np.number]).columns
    summary = {name: _describe(data[name]) for name in numeric_names}
    return str(summary)
|
|
|
@tool |
|
def generate_correlation_matrix(data: pd.DataFrame) -> str:
    """Generate a visual correlation matrix for numerical columns in the dataset.

    Args:
        data (pd.DataFrame): The dataset to analyze. It must contain at least
            one numerical column; two or more are needed for a meaningful plot.

    Returns:
        str: A base64 encoded string representing the correlation matrix plot
            image (PNG).

    Raises:
        ValueError: If the dataset has no numerical columns.
    """
    numeric_data = data.select_dtypes(include=[np.number])

    # Fail early with a clear message instead of an opaque plotting error.
    if numeric_data.shape[1] == 0:
        raise ValueError(
            "No numerical columns available to build a correlation matrix"
        )

    # Draw on an explicit figure rather than pyplot's implicit "current
    # figure", and always close it so a failed render does not leak it.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', ax=ax)
        ax.set_title('Correlation Matrix')

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        plt.close(fig)

    return base64.b64encode(buf.getvalue()).decode()
|
|
|
@tool |
|
def analyze_categorical_columns(data: pd.DataFrame) -> str:
    """Profile the categorical columns of the dataset.

    Args:
        data (pd.DataFrame): The dataset to analyze. Only columns of dtype
            ``object`` or ``category`` are included in the result.

    Returns:
        str: The string form of a dict that maps each categorical column name
            to its number of unique values, its five most frequent categories
            with their counts, and its missing value count. The dict is empty
            when there are no categorical columns.
    """

    def _profile(name):
        series = data[name]
        return {
            'unique_values': int(series.nunique()),
            'top_categories': series.value_counts().head(5).to_dict(),
            'missing': int(series.isnull().sum()),
        }

    names = data.select_dtypes(include=['object', 'category']).columns
    report = {name: _profile(name) for name in names}
    return str(report)
|
|
|
@tool |
|
def suggest_features(data: pd.DataFrame) -> str:
    """Suggest potential feature engineering steps based on data characteristics.

    Args:
        data (pd.DataFrame): The dataset to analyze. It can contain both
            numerical and categorical columns.

    Returns:
        str: Newline-separated suggestions; an empty string when nothing
            applies. Suggestions cover interaction terms (two or more numerical
            columns), one-hot encoding (any object/category column), and a
            transformation for every numerical column whose skewness is
            above 1 or below -1.
    """
    suggestions = []
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns

    if len(numeric_cols) >= 2:
        suggestions.append("Consider creating interaction terms between numerical features")

    if len(categorical_cols) > 0:
        suggestions.append("Consider one-hot encoding for categorical variables")

    for col in numeric_cols:
        series = data[col]
        skewness = series.skew()  # computed once; NaN fails both comparisons

        if skewness > 1:
            if series.min() > 0:
                suggestions.append(f"Consider log transformation for {col} due to skewness")
            else:
                # log() is undefined for zero/negative values, so recommend a
                # transformation that accepts them instead.
                suggestions.append(
                    f"Consider a Yeo-Johnson transformation for {col} due to skewness "
                    f"(log is undefined for its non-positive values)"
                )
        elif skewness < -1:
            # A log transform compresses the right tail and therefore makes a
            # left-skewed column worse; do not suggest it here.
            suggestions.append(
                f"Consider a power or Yeo-Johnson transformation for {col} "
                f"due to negative skewness"
            )

    return '\n'.join(suggestions)
|
|
|
# Every PNG file starts with this fixed 8-byte signature.
_PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n"

# Canned analysis types mapped to (spinner text, prompt prefix for the agent).
# The uploaded DataFrame's string form is appended to the prefix.
_CANNED_ANALYSES = {
    "Basic Statistics": (
        'Analyzing basic statistics...',
        "Analyze the dataset and provide basic statistics: ",
    ),
    "Correlation Analysis": (
        'Generating correlation matrix...',
        "Generate a correlation matrix for the dataset: ",
    ),
    "Categorical Analysis": (
        'Analyzing categorical columns...',
        "Analyze categorical columns in the dataset: ",
    ),
    "Feature Engineering": (
        'Generating feature suggestions...',
        "Suggest feature engineering steps for the dataset: ",
    ),
}


def _extract_png_bytes(result):
    """Return the decoded PNG bytes if ``result`` is a base64 PNG, else None.

    Accepts either a bare base64 payload or a ``data:image...`` URI whose
    payload follows the first comma.
    """
    if not isinstance(result, str):
        return None

    payload = result.strip()
    if payload.startswith('data:image'):
        # Drop the "data:image/...;base64," header and keep only the payload.
        _, _, payload = payload.partition(',')

    try:
        raw = base64.b64decode(payload, validate=True)
    except ValueError:  # binascii.Error is a subclass of ValueError
        return None

    return raw if raw.startswith(_PNG_SIGNATURE) else None


def main():
    """Run the Streamlit data-analysis assistant.

    Lets the user upload a CSV file, stores the parsed DataFrame and a
    LangChain agent in ``st.session_state``, shows a preview, and then runs
    either one of the canned analyses or a free-form question through the
    agent. Any exception raised along the way is reported with ``st.error``
    instead of crashing the app.
    """
    st.title("Data Analysis Assistant")
    st.write("Upload your dataset and get automated analysis with natural language interaction.")

    if 'data' not in st.session_state:
        st.session_state['data'] = None
    if 'agent' not in st.session_state:
        st.session_state['agent'] = None

    uploaded_file = st.file_uploader("Drag and drop a CSV file here", type="csv")

    try:
        if uploaded_file is not None:
            with st.spinner('Loading and processing your data...'):
                # Rewind in case the buffer was already consumed earlier.
                uploaded_file.seek(0)
                data = pd.read_csv(uploaded_file)
                st.session_state['data'] = data

                # The agent depends only on the tools and the model, not on
                # the data, so build it once instead of on every rerun.
                if st.session_state['agent'] is None:
                    tools = [analyze_basic_stats, generate_correlation_matrix,
                             analyze_categorical_columns, suggest_features]
                    st.session_state['agent'] = initialize_agent(
                        tools=tools,
                        llm=llm,
                        agent=AgentType.OPENAI_FUNCTIONS,
                        verbose=True
                    )

            st.success(f'Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns')
            st.subheader("Data Preview")
            st.dataframe(data.head())

        if st.session_state['data'] is not None:
            analysis_type = st.selectbox(
                "Choose analysis type",
                [*_CANNED_ANALYSES, "Custom Question"]
            )
            agent = st.session_state['agent']

            if analysis_type in _CANNED_ANALYSES:
                spinner_text, prompt_prefix = _CANNED_ANALYSES[analysis_type]
                with st.spinner(spinner_text):
                    # NOTE(review): this embeds the DataFrame's string form in
                    # the prompt; pandas abbreviates that text for large
                    # frames, so the agent may only see part of the data.
                    result = agent.run(f"{prompt_prefix}{st.session_state['data']}")

                # Only the correlation analysis can yield an image. Render it
                # as one only when the reply really decodes to a PNG; a plain
                # text answer that merely contains a comma stays text.
                png_bytes = None
                if analysis_type == "Correlation Analysis":
                    png_bytes = _extract_png_bytes(result)

                if png_bytes is not None:
                    st.image(png_bytes)
                else:
                    st.write(result)

            elif analysis_type == "Custom Question":
                question = st.text_input("What would you like to know about your data?")
                if question:
                    with st.spinner('Analyzing...'):
                        result = agent.run(question)
                    st.write(result)

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user.
        st.error(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()