mgbam committed
Commit 2cebaf2 · verified · 1 Parent(s): 33c4308

Update app.py

Files changed (1)
  1. app.py +182 -118
app.py CHANGED
@@ -12,60 +12,76 @@ from typing import Dict, List, Optional
 from langchain.tools import tool
 from langchain.agents import initialize_agent, AgentType
 from scipy.stats import ttest_ind, f_oneway
+from statsmodels.tsa.seasonal import seasonal_decompose
+from statsmodels.tsa.stattools import adfuller
+from jinja2 import Template
 
 # Initialize Groq Client
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 
+
 class ResearchInput(BaseModel):
-    """Base schema for research tool inputs"""
-    data_key: str = Field(..., description="Session state key containing DataFrame")
-    columns: Optional[List[str]] = Field(None, description="List of columns to analyze")
+    """Base schema for research tool inputs, ensuring type and description integrity."""
+    data_key: str = Field(..., description="Session state key containing the DataFrame.")
+    columns: Optional[List[str]] = Field(None, description="List of column names to analyze.")
+
 
 class TemporalAnalysisInput(ResearchInput):
-    """Schema for temporal analysis"""
-    time_col: str = Field(..., description="Name of timestamp column")
-    value_col: str = Field(..., description="Name of value column to analyze")
+    """Schema for temporal analysis inputs, focusing on specific time-series requirements."""
+    time_col: str = Field(..., description="Name of the column containing timestamp data.")
+    value_col: str = Field(..., description="Name of the column containing numerical values to analyze.")
+
 
 class HypothesisInput(ResearchInput):
-    """Schema for hypothesis testing"""
-    group_col: str = Field(..., description="Categorical column defining groups")
-    value_col: str = Field(..., description="Numerical column to compare")
+    """Schema for hypothesis testing, demanding group and value specification for statistical rigor."""
+    group_col: str = Field(..., description="Categorical column defining the groups for comparison.")
+    value_col: str = Field(..., description="Numerical column for comparing means across groups.")
+
 
 class GroqResearcher:
-    """Advanced AI Research Engine using Groq"""
+    """
+    A sophisticated AI research engine powered by Groq, designed for rigorous academic-style analysis.
+    This class handles complex data queries and delivers structured research outputs.
+    """
+
     def __init__(self, model_name="mixtral-8x7b-32768"):
         self.model_name = model_name
-        self.system_template = """You are a senior data scientist at a research institution.
-        Analyze this dataset with rigorous statistical methods and provide academic-quality insights:
-        {dataset_info}
-
-        User Question: {query}
-
-        Required Format:
-        - Executive Summary (1 paragraph)
-        - Methodology (bullet points)
-        - Key Findings (numbered list)
-        - Limitations
-        - Recommended Next Steps"""
+        self.system_template = """
+        You are a senior data scientist at a prestigious research institution. Your analysis must
+        adhere to rigorous scientific standards. Consider the dataset properties and the user's query.
+
+        Dataset Context:
+        - Dimensions: {{ dataset_shape }}
+        - Variables: {{ dataset_variables }}
+        - Temporal Coverage: {{ temporal_coverage }}
+        - Missing Value Counts: {{ missing_values }}
+
+        User Inquiry: {{ query }}
+
+        Response Structure (Critical for all analyses):
+        1. **Executive Summary:** Provide a 1-2 paragraph overview of the findings, contextualized within the dataset's characteristics.
+        2. **Methodology:** Detail the exact analysis techniques used, including statistical tests or model types, and their justification.
+        3. **Key Findings:** Present the most significant observations and statistical results (p-values, effect sizes) with proper interpretation.
+        4. **Limitations:** Acknowledge and describe the constraints of the dataset or analytical methods that might affect the results' interpretation or generalizability.
+        5. **Recommended Next Steps:** Suggest future studies, experiments, or analyses that could extend the current investigation and address the noted limitations.
+        """
 
     def research(self, query: str, data: pd.DataFrame) -> str:
-        """Conduct academic-level analysis using Groq"""
+        """Executes in-depth research using the Groq API to produce academic-quality analyses."""
         try:
-            dataset_info = f"""
-            Dataset Dimensions: {data.shape}
-            Variables: {', '.join(data.columns)}
-            Temporal Coverage: {data.select_dtypes(include='datetime').columns.tolist()}
-            Missing Values: {data.isnull().sum().to_dict()}
-            """
-
-            prompt = PromptTemplate.from_template(self.system_template).format(
-                dataset_info=dataset_info,
-                query=query
-            )
-
+            dataset_info = {
+                "dataset_shape": str(data.shape),
+                "dataset_variables": ", ".join(data.columns),
+                "temporal_coverage": str(data.select_dtypes(include='datetime').columns.tolist()),
+                "missing_values": str(data.isnull().sum().to_dict()),
+            }
+
+            prompt = Template(self.system_template).render(**dataset_info, query=query)
+
             completion = client.chat.completions.create(
                 messages=[
-                    {"role": "system", "content": "You are a research AI assistant"},
+                    {"role": "system", "content": "You are a research AI assistant."},
                     {"role": "user", "content": prompt}
                 ],
                 model=self.model_name,
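Note on the hunk above: the prompt construction moves from LangChain's PromptTemplate ({var} placeholders) to a raw jinja2.Template ({{ var }} placeholders). A minimal standalone sketch of the new rendering path, with illustrative values standing in for the app's DataFrame-derived ones:

from jinja2 import Template

template = Template("Dimensions: {{ dataset_shape }}\nUser Inquiry: {{ query }}")
# Keyword arguments to render() map directly onto the {{ ... }} placeholders.
prompt = template.render(dataset_shape="(1000, 5)", query="Is revenue seasonal?")
print(prompt)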
@@ -73,20 +89,22 @@ class GroqResearcher:
                 max_tokens=4096,
                 stream=False
             )
-
             return completion.choices[0].message.content
-
         except Exception as e:
-            return f"Research Error: {str(e)}"
+            return f"Research Error Encountered: {str(e)}"
+
 
 @tool(args_schema=ResearchInput)
 def advanced_eda(data_key: str) -> Dict:
-    """Comprehensive Exploratory Data Analysis with Statistical Profiling"""
+    """
+    Performs a comprehensive Exploratory Data Analysis, including statistical profiling,
+    temporal analysis of datetime columns, and detailed quality checks.
+    """
     try:
         data = st.session_state[data_key]
         analysis = {
             "dimensionality": {
-                "rows": len(data),
+                "rows": int(len(data)),  # Ensure rows are an integer
                 "columns": list(data.columns),
                 "memory_usage": f"{data.memory_usage().sum() / 1e6:.2f} MB"
             },
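A note on the int() coercions introduced here and continued in the next hunk: len() already returns a plain Python int, but pandas reductions such as .duplicated().sum() and .nunique() return numpy scalars, which the stdlib JSON encoder behind st.json rejects. A quick illustration:

import json
import numpy as np

count = np.int64(42)  # the kind of scalar pandas reductions return
# json.dumps({"rows": count}) would raise TypeError: Object of type int64 is not JSON serializable
print(json.dumps({"rows": int(count)}))  # {"rows": 42}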
@@ -94,112 +112,147 @@ def advanced_eda(data_key: str) -> Dict:
             "temporal_analysis": {
                 "date_ranges": {
                     col: {
-                        "min": data[col].min(),
-                        "max": data[col].max()
+                        "min": str(data[col].min()),  # Ensure date is a string
+                        "max": str(data[col].max())   # Ensure date is a string
                     } for col in data.select_dtypes(include='datetime').columns
                 }
             },
             "data_quality": {
                 "missing_values": data.isnull().sum().to_dict(),
-                "duplicates": data.duplicated().sum(),
+                "duplicates": int(data.duplicated().sum()),  # Ensure duplicates are an integer
                 "cardinality": {
-                    col: data[col].nunique() for col in data.columns
+                    col: int(data[col].nunique()) for col in data.columns  # Ensure cardinality is an integer
                 }
             }
         }
         return analysis
     except Exception as e:
-        return {"error": f"EDA Failed: {str(e)}"}
+        return {"error": f"Advanced EDA Failed: {str(e)}"}
 
 @tool(args_schema=ResearchInput)
 def visualize_distributions(data_key: str, columns: List[str]) -> str:
-    """Generate publication-quality distribution visualizations"""
+    """
+    Generates high-quality, publication-ready distribution visualizations (histograms with KDE)
+    for selected numerical columns, and returns the image as a base64 encoded string.
+    """
     try:
         data = st.session_state[data_key]
-        plt.figure(figsize=(12, 6))
+        plt.figure(figsize=(15, 7))  # Adjusted figure size for better readability
         for i, col in enumerate(columns, 1):
             plt.subplot(1, len(columns), i)
-            sns.histplot(data[col], kde=True, stat="density")
-            plt.title(f'Distribution of {col}', fontsize=10)
-            plt.xticks(fontsize=8)
-            plt.yticks(fontsize=8)
-        plt.tight_layout()
+            sns.histplot(data[col], kde=True, stat="density", color=sns.color_palette()[i % len(sns.color_palette())])
+            plt.title(f'Distribution of {col}', fontsize=14, fontweight='bold')  # Enhanced title
+            plt.xlabel(col, fontsize=12)
+            plt.ylabel('Density', fontsize=12)
+            plt.xticks(fontsize=10)
+            plt.yticks(fontsize=10)
+            plt.grid(axis='y', linestyle='--')
+            sns.despine(top=True, right=True)  # Improved styling
+        plt.tight_layout(pad=2)  # Added padding for tight layout
 
         buf = io.BytesIO()
         plt.savefig(buf, format='png', dpi=300, bbox_inches='tight')
         plt.close()
         return base64.b64encode(buf.getvalue()).decode()
     except Exception as e:
-        return f"Visualization Error: {str(e)}"
+        return f"Distribution Visualization Error: {str(e)}"
+
 
 @tool(args_schema=TemporalAnalysisInput)
 def temporal_analysis(data_key: str, time_col: str, value_col: str) -> Dict:
-    """Time Series Decomposition and Trend Analysis"""
+    """
+    Performs a sophisticated time series analysis, including decomposition and trend assessment,
+    providing both statistical insights and a visual representation.
+    """
     try:
         data = st.session_state[data_key]
-        ts_data = data.set_index(pd.to_datetime(data[time_col]))[value_col]
-
-        decomposition = seasonal_decompose(ts_data, period=365)
-
-        plt.figure(figsize=(12, 8))
+        ts_data = data.set_index(pd.to_datetime(data[time_col]))[value_col].dropna()  # Handle NaNs
+
+        if ts_data.empty:
+            return {"error": "No valid time series data found for analysis after NaN removal."}
+
+        decomposition = seasonal_decompose(ts_data, model='additive', period=min(len(ts_data), 365) if len(ts_data) > 10 else 1)
+
+        plt.figure(figsize=(16, 10))
         decomposition.plot()
         plt.tight_layout()
 
         buf = io.BytesIO()
-        plt.savefig(buf, format='png')
+        plt.savefig(buf, format='png', dpi=300)  # Increased dpi for higher resolution
         plt.close()
         plot_data = base64.b64encode(buf.getvalue()).decode()
-
+
+        adf_result = adfuller(ts_data)
+        stationarity_p_value = adf_result[1]
+
         return {
             "trend_statistics": {
-                "stationarity": adfuller(ts_data)[1],
-                "seasonality_strength": max(decomposition.seasonal)
+                "stationarity": stationarity_p_value,
+                "stationarity_interpretation": interpret_p_value(stationarity_p_value),
+                "seasonality_strength": max(decomposition.seasonal) if hasattr(decomposition, 'seasonal') else None
             },
-            "visualization": plot_data
+            "visualization": plot_data,
+            "decomposition_data": {
+                "trend": decomposition.trend.dropna().to_dict() if hasattr(decomposition, 'trend') else None,
+                "seasonal": decomposition.seasonal.dropna().to_dict() if hasattr(decomposition, 'seasonal') else None,
+                "residual": decomposition.resid.dropna().to_dict() if hasattr(decomposition, 'resid') else None,
+            }
         }
     except Exception as e:
-        return {"error": f"Temporal Analysis Failed: {str(e)}"}
+        return {"error": f"Temporal Analysis Failure: {str(e)}"}
 
 @tool(args_schema=HypothesisInput)
 def hypothesis_testing(data_key: str, group_col: str, value_col: str) -> Dict:
-    """Statistical Hypothesis Testing with Automated Assumption Checking"""
+    """
+    Conducts statistical hypothesis testing, providing detailed test results, effect size measures,
+    and interpretations for both t-tests and ANOVAs.
+    """
     try:
         data = st.session_state[data_key]
         groups = data[group_col].unique()
 
         if len(groups) < 2:
-            return {"error": "Insufficient groups for comparison"}
+            return {"error": "Insufficient groups for comparison. Must have at least two groups."}
+
+        group_data = [data[data[group_col] == g][value_col].dropna() for g in groups]
+
+        if any(len(group) < 2 for group in group_data):
+            return {"error": "Each group must have at least two data points for testing."}
 
         if len(groups) == 2:
-            group_data = [data[data[group_col] == g][value_col] for g in groups]
             stat, p = ttest_ind(*group_data)
             test_type = "Independent t-test"
         else:
-            group_data = [data[data[group_col] == g][value_col] for g in groups]
             stat, p = f_oneway(*group_data)
             test_type = "ANOVA"
 
+        effect_size = None
+        if len(groups) == 2:
+            pooled_variance = np.sqrt((group_data[0].var() + group_data[1].var()) / 2)
+            if pooled_variance != 0:
+                cohens_d = abs(group_data[0].mean() - group_data[1].mean()) / pooled_variance
+                effect_size = {"cohens_d": cohens_d}
+            else:
+                effect_size = {"cohens_d": None, "error": "Cannot compute effect size due to zero pooled variance."}
+
         return {
             "test_type": test_type,
-            "test_statistic": stat,
-            "p_value": p,
-            "effect_size": {
-                "cohens_d": abs(group_data[0].mean() - group_data[1].mean())/np.sqrt(
-                    (group_data[0].var() + group_data[1].var())/2
-                ) if len(groups) == 2 else None
-            },
-            "interpretation": interpret_p_value(p)
+            "test_statistic": float(stat),  # Ensure stat is a float
+            "p_value": float(p),  # Ensure p_value is a float
+            "effect_size": effect_size,
+            "interpretation": interpret_p_value(p),
+            "group_means": {g: float(data[data[group_col] == g][value_col].mean()) for g in groups}
         }
     except Exception as e:
         return {"error": f"Hypothesis Testing Failed: {str(e)}"}
 
 def interpret_p_value(p: float) -> str:
-    """Scientific interpretation of p-values"""
-    if p < 0.001: return "Very strong evidence against H0"
-    elif p < 0.01: return "Strong evidence against H0"
-    elif p < 0.05: return "Evidence against H0"
-    elif p < 0.1: return "Weak evidence against H0"
-    else: return "No significant evidence against H0"
+    """Provides nuanced interpretations of p-values, including qualitative descriptors."""
+    if p < 0.001: return "Highly significant evidence against the null hypothesis (p < 0.001)."
+    elif p < 0.01: return "Strong evidence against the null hypothesis (0.001 ≤ p < 0.01)."
+    elif p < 0.05: return "Moderate evidence against the null hypothesis (0.01 ≤ p < 0.05)."
+    elif p < 0.1: return "Weak evidence against the null hypothesis (0.05 ≤ p < 0.1)."
+    else: return "No significant evidence against the null hypothesis (p ≥ 0.1)."
 
 def main():
     st.set_page_config(page_title="AI Research Lab", layout="wide")
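A caveat on the new period argument in temporal_analysis: statsmodels' seasonal_decompose requires at least two complete cycles (len(ts) >= 2 * period), so period=min(len(ts_data), 365) still raises on series shorter than two years of daily observations. A hedged sketch of a stricter guard (the helper name and heuristic are illustrative, not from this commit):

from statsmodels.tsa.seasonal import seasonal_decompose

def safe_decompose(ts, preferred_period=365):
    # seasonal_decompose raises unless the series covers two full cycles,
    # so cap the period at half the series length.
    period = max(2, min(preferred_period, len(ts) // 2))
    return seasonal_decompose(ts, model="additive", period=period)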
@@ -217,9 +270,12 @@ def main():
     uploaded_file = st.file_uploader("Upload research dataset", type=["csv", "parquet"])
     if uploaded_file:
         with st.spinner("Initializing dataset..."):
-            st.session_state.data = pd.read_csv(uploaded_file)
-            st.success(f"Loaded {len(st.session_state.data):,} research observations")
-
+            try:
+                st.session_state.data = pd.read_csv(uploaded_file)
+                st.success(f"Loaded {len(st.session_state.data):,} research observations")
+            except Exception as e:
+                st.error(f"Error loading the dataset. Please ensure it's a valid CSV or Parquet format. Error details: {e}")
+
     # Main research interface
     if st.session_state.data is not None:
         col1, col2 = st.columns([1, 3])
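Even with the new try/except, the loader still calls pd.read_csv unconditionally while the uploader advertises parquet, so parquet uploads will always land in the error branch. A sketch of extension-based dispatch (load_dataset is an illustrative helper, not part of the commit):

import pandas as pd

def load_dataset(uploaded_file):
    # Streamlit's UploadedFile carries the original filename in .name.
    if uploaded_file.name.lower().endswith(".parquet"):
        return pd.read_parquet(uploaded_file)
    return pd.read_csv(uploaded_file)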
@@ -230,10 +286,10 @@ def main():
             "Variables": list(st.session_state.data.columns),
             "Time Range": {
                 col: {
-                    "min": st.session_state.data[col].min(),
-                    "max": st.session_state.data[col].max()
+                    "min": str(st.session_state.data[col].min()),
+                    "max": str(st.session_state.data[col].max())
                 } for col in st.session_state.data.select_dtypes(include='datetime').columns
-            },
+            } if st.session_state.data.select_dtypes(include='datetime').columns.tolist() else "No Temporal Data",
             "Size": f"{st.session_state.data.memory_usage().sum() / 1e6:.2f} MB"
         })
@@ -254,35 +310,42 @@ def main():
             st.json(eda_result)
 
         elif analysis_type == "Temporal Pattern Analysis":
-            time_col = st.selectbox("Temporal Variable",
-                st.session_state.data.select_dtypes(include='datetime').columns)
-            value_col = st.selectbox("Analysis Variable",
-                st.session_state.data.select_dtypes(include=np.number).columns)
-
-            if time_col and value_col:
-                result = temporal_analysis.invoke({
-                    "data_key": "data",
-                    "time_col": time_col,
-                    "value_col": value_col
-                })
-                if "visualization" in result:
-                    st.image(f"data:image/png;base64,{result['visualization']}")
-                st.json(result)
+            time_cols = st.session_state.data.select_dtypes(include='datetime').columns.tolist()
+            if not time_cols:
+                st.warning("No datetime columns detected. Please ensure you have a datetime column for this analysis.")
+            else:
+                time_col = st.selectbox("Temporal Variable", time_cols)
+                value_col = st.selectbox("Analysis Variable",
+                    st.session_state.data.select_dtypes(include=np.number).columns)
+
+                if time_col and value_col:
+                    result = temporal_analysis.invoke({
+                        "data_key": "data",
+                        "time_col": time_col,
+                        "value_col": value_col
+                    })
+                    if "visualization" in result:
+                        st.image(f"data:image/png;base64,{result['visualization']}",
+                                 use_column_width=True)
+                    st.json(result)
 
         elif analysis_type == "Comparative Statistics":
-            group_col = st.selectbox("Grouping Variable",
-                st.session_state.data.select_dtypes(include='category').columns)
-            value_col = st.selectbox("Metric Variable",
-                st.session_state.data.select_dtypes(include=np.number).columns)
-
-            if group_col and value_col:
-                result = hypothesis_testing.invoke({
-                    "data_key": "data",
-                    "group_col": group_col,
-                    "value_col": value_col
-                })
-                st.subheader("Statistical Test Results")
-                st.json(result)
+            cat_cols = st.session_state.data.select_dtypes(include='category').columns.tolist() + st.session_state.data.select_dtypes(include='object').columns.tolist()
+            if not cat_cols:
+                st.warning("No categorical columns detected. Please ensure you have a categorical column for this analysis.")
+            else:
+                group_col = st.selectbox("Grouping Variable", cat_cols)
+                value_col = st.selectbox("Metric Variable",
+                    st.session_state.data.select_dtypes(include=np.number).columns)
+
+                if group_col and value_col:
+                    result = hypothesis_testing.invoke({
+                        "data_key": "data",
+                        "group_col": group_col,
+                        "value_col": value_col
+                    })
+                    st.subheader("Statistical Test Results")
+                    st.json(result)
 
         elif analysis_type == "Distribution Analysis":
             num_cols = st.session_state.data.select_dtypes(include=np.number).columns.tolist()
@@ -292,7 +355,8 @@ def main():
                 "data_key": "data",
                 "columns": selected_cols
             })
-            st.image(f"data:image/png;base64,{img_data}")
+            st.image(f"data:image/png;base64,{img_data}",
+                     use_column_width=True)
 
     with research_tab:
         research_query = st.text_area("Enter Research Question:", height=150,
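Finally, a reading note on the "stationarity" field added in temporal_analysis: adfuller returns a tuple whose second element is the unit-root test's p-value, so a small value is evidence for stationarity, the opposite polarity from the group-comparison tests above. A sketch:

import numpy as np
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(0)
stat, pvalue = adfuller(rng.normal(size=200))[:2]  # white noise has no unit root
print(f"ADF statistic={stat:.2f}, p-value={pvalue:.4f}")  # expect a small p-value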