" for col, stats in analysis.items(): html_content += f"

" html_content += f"

{col}

" html_content += f"

Unique Values: {stats['unique_values']}

" html_content += f"

Missing Values: {stats['missing']}

" # Add bar chart for top categories if stats['top_categories']: categories = list(stats['top_categories'].keys()) values = list(stats['top_categories'].values()) fig = go.Figure() fig.add_trace(go.Bar( x=categories, y=values, marker_color='#1a73e8', hoverinfo='x+y' )) fig.update_layout( title=f"Top Categories for {col}", xaxis_title="Category", yaxis_title="Count", font=dict(family="Inter, sans-serif"), height=350, margin=dict(l=40, r=40, t=60, b=80), xaxis=dict(tickangle=-45) ) html_content += fig.to_html(full_html=False, include_plotlyjs='cdn') html_content += "

" html_content += "

Data Analysis Assistant

', unsafe_allow_html=True) st.markdown('

Upload Dataset

', unsafe_allow_html=True) # File uploader with custom styling uploaded_file = st.file_uploader("", type="csv") if uploaded_file is not None: try: with st.spinner('Processing dataset...'): # Load the dataset data = pd.read_csv(uploaded_file) st.session_state['data'] = data # Initialize the agent with the dataset st.session_state['agent'] = DataAnalysisAgent( dataset=data, tools=[analyze_basic_stats, generate_correlation_matrix, analyze_categorical_columns, suggest_features, visualize_distributions], model=GroqLLM(), additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn", "plotly"] ) # Display dataset statistics st.markdown("""

✓ Dataset loaded successfully

""", unsafe_allow_html=True) col1, col2 = st.columns(2) with col1: st.markdown(f"""

{data.shape[0]:,}

Rows

""", unsafe_allow_html=True) with col2: st.markdown(f"""

{data.shape[1]}

Columns

""", unsafe_allow_html=True) except Exception as e: st.error(f"Error: {str(e)}") # Analysis type selection if st.session_state['data'] is not None: st.markdown('

Analysis Tools

', unsafe_allow_html=True) analysis_type = st.selectbox( "Select analysis type", ["Data Overview", "Basic Statistics", "Feature Correlations", "Categorical Analysis", "Feature Engineering", "Data Distributions", "Ask Your Own Question"] ) st.markdown('

', unsafe_allow_html=True) st.markdown('

Data Preview

', unsafe_allow_html=True) # Add tabs for different data views data_tabs = st.tabs(["Data Sample", "Column Info", "Missing Values"]) with data_tabs[0]: st.markdown('

', unsafe_allow_html=True) st.dataframe(st.session_state['data'].head(10), use_container_width=True) st.markdown('

', unsafe_allow_html=True) with data_tabs[1]: col1, col2, col3 = st.columns(3) with col1: st.markdown("**Column Names**") st.write(st.session_state['data'].columns.tolist()) with col2: st.markdown("**Data Types**") for col, dtype in st.session_state['data'].dtypes.items(): st.write(f"{col}: {dtype}") with col3: st.markdown("**Non-Null Count**") for col, count in st.session_state['data'].count().items(): st.write(f"{col}: {count}/{len(st.session_state['data'])}") with data_tabs[2]: missing_data = st.session_state['data'].isnull().sum() if missing_data.sum() > 0: missing_df = pd.DataFrame({ 'Column': missing_data.index, 'Missing Values': missing_data.values, 'Percentage': round(missing_data.values / len(st.session_state['data']) * 100, 2) }) missing_df = missing_df[missing_df['Missing Values'] > 0].sort_values('Missing Values', ascending=False) st.dataframe(missing_df, use_container_width=True) # Add a visualization of missing values fig = px.bar( missing_df, x='Column', y='Percentage', color='Percentage', color_continuous_scale='Blues', title='Missing Values by Column (%)' ) fig.update_layout( xaxis_title='', yaxis_title='Missing Values (%)', height=400 ) st.plotly_chart(fig, use_container_width=True) else: st.success("No missing values in the dataset!") st.markdown('

', unsafe_allow_html=True) st.markdown(f'

{analysis_type} Results

', unsafe_allow_html=True) if analysis_type == "Data Overview": col1, col2 = st.columns(2) with col1: st.markdown("### Dataset Summary") st.dataframe(st.session_state['data'].describe(), use_container_width=True) with col2: st.markdown("### Data Profile") numeric_count = len(st.session_state['data'].select_dtypes(include=[np.number]).columns) categorical_count = len(st.session_state['data'].select_dtypes(include=['object', 'category']).columns) # Create a pie chart for data types fig = px.pie( values=[numeric_count, categorical_count], names=['Numeric', 'Categorical'], color_discrete_sequence=['#1a73e8', '#34a853'], hole=0.4 ) fig.update_layout( title='Column Types', font=dict(family="Inter, sans-serif"), legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5) ) st.plotly_chart(fig, use_container_width=True) elif analysis_type == "Basic Statistics": with st.spinner('Analyzing basic statistics...'): result = st.session_state['agent'].run( "Use the analyze_basic_stats tool to analyze this dataset and " "provide insights about the numerical distributions." ) # Parse the string representation of the dictionary try: # Remove the literal 'str' prefix if present if result.startswith("str("): result = result[4:-1] # Convert string to dict import ast stats_dict = ast.literal_eval(result) # Display results in a more visual format for col, stats in stats_dict.items(): st.markdown(f"### {col}") # Create metrics in columns col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Mean", f"{stats['mean']:.2f}") with col2: st.metric("Median", f"{stats['median']:.2f}") with col3: st.metric("Std Dev", f"{stats['std']:.2f}") with col4: st.metric("Skewness", f"{stats['skew']:.2f}") # Create a boxplot for this column fig = px.box( st.session_state['data'], y=col, points="all", color_discrete_sequence=['#1a73e8'], title=f"Distribution of {col}" ) fig.update_layout( height=300, margin=dict(t=40, b=20, l=40, r=20), font=dict(family="Inter, sans-serif") ) st.plotly_chart(fig, use_container_width=True) st.markdown("---") except Exception as e: st.write(result) elif analysis_type == "Feature Correlations": with st.spinner('Analyzing feature correlations...'): result = st.session_state['agent'].run( "Use the generate_correlation_matrix tool to analyze correlations " "and explain any strong relationships found." ) # If the result is HTML, display it directly if isinstance(result, str) and ("', unsafe_allow_html=True) else: # Display welcome message for users who haven't uploaded data yet st.markdown("""

Welcome to Data Analysis Assistant

Upload a CSV file to get started with instant insights and intelligent analysis. Our AI-powered assistant will help you understand your data like never before.

📊

Automatic Visualizations

Get instant charts and plots revealing insights in your data

🧠

AI-Powered Analysis

Advanced algorithms find patterns and correlations automatically

💡

Smart Recommendations

Get suggestions for feature engineering and data preparation

""", unsafe_allow_html=True) # Import for subplot creation from plotly.subplots import make_subplots if __name__ == "__main__": # Check if Groq API key is available if not os.environ.get("GROQ_API_KEY"): st.error(""" GROQ API key not found! Please set your GROQ_API_KEY environment variable. You can get an API key from https://console.groq.com/ """) else: main()

{col}

Feature Engineering Suggestions

Data Analysis Assistant

Welcome to Data Analysis Assistant

Automatic Visualizations

AI-Powered Analysis

Smart Recommendations