mgbam committed
Commit 3280b05 · verified · 1 Parent(s): 2cebaf2

Update app.py

Files changed (1)
  1. app.py +118 -182
app.py CHANGED
@@ -12,76 +12,60 @@ from typing import Dict, List, Optional
from langchain.tools import tool
from langchain.agents import initialize_agent, AgentType
from scipy.stats import ttest_ind, f_oneway
- from statsmodels.tsa.seasonal import seasonal_decompose
- from statsmodels.tsa.stattools import adfuller
- from jinja2 import Template

# Initialize Groq Client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

class ResearchInput(BaseModel):
-     """Base schema for research tool inputs, ensuring type and description integrity."""
-     data_key: str = Field(..., description="Session state key containing the DataFrame.")
-     columns: Optional[List[str]] = Field(None, description="List of column names to analyze.")
+     """Base schema for research tool inputs"""
+     data_key: str = Field(..., description="Session state key containing DataFrame")
+     columns: Optional[List[str]] = Field(None, description="List of columns to analyze")

class TemporalAnalysisInput(ResearchInput):
-     """Schema for temporal analysis inputs, focusing on specific time-series requirements."""
-     time_col: str = Field(..., description="Name of the column containing timestamp data.")
-     value_col: str = Field(..., description="Name of the column containing numerical values to analyze.")
+     """Schema for temporal analysis"""
+     time_col: str = Field(..., description="Name of timestamp column")
+     value_col: str = Field(..., description="Name of value column to analyze")

class HypothesisInput(ResearchInput):
-     """Schema for hypothesis testing, demanding group and value specification for statistical rigor."""
-     group_col: str = Field(..., description="Categorical column defining the groups for comparison.")
-     value_col: str = Field(..., description="Numerical column for comparing means across groups.")
+     """Schema for hypothesis testing"""
+     group_col: str = Field(..., description="Categorical column defining groups")
+     value_col: str = Field(..., description="Numerical column to compare")

class GroqResearcher:
-     """
-     A sophisticated AI research engine powered by Groq, designed for rigorous academic-style analysis.
-     This class handles complex data queries and delivers structured research outputs.
-     """
+     """Advanced AI Research Engine using Groq"""

    def __init__(self, model_name="mixtral-8x7b-32768"):
        self.model_name = model_name
-         self.system_template = """
-         You are a senior data scientist at a prestigious research institution. Your analysis must
-         adhere to rigorous scientific standards. Consider the dataset properties and the user's query.
+         self.system_template = """You are a senior data scientist at a research institution.
+         Analyze this dataset with rigorous statistical methods and provide academic-quality insights:
+         {dataset_info}

-         Dataset Context:
-         - Dimensions: {{ dataset_shape }}
-         - Variables: {{ dataset_variables }}
-         - Temporal Coverage: {{ temporal_coverage }}
-         - Missing Value Counts: {{ missing_values }}
-
-         User Inquiry: {{ query }}
-
-         Response Structure (Critical for all analyses):
-         1. **Executive Summary:** Provide a 1-2 paragraph overview of the findings, contextualized within the dataset's characteristics.
-         2. **Methodology:** Detail the exact analysis techniques used, including statistical tests or model types, and their justification.
-         3. **Key Findings:** Present the most significant observations and statistical results (p-values, effect sizes) with proper interpretation.
-         4. **Limitations:** Acknowledge and describe the constraints of the dataset or analytical methods that might affect the results' interpretation or generalizability.
-         5. **Recommended Next Steps:** Suggest future studies, experiments, or analyses that could extend the current investigation and address the noted limitations.
-
-         """
+         User Question: {query}
+
+         Required Format:
+         - Executive Summary (1 paragraph)
+         - Methodology (bullet points)
+         - Key Findings (numbered list)
+         - Limitations
+         - Recommended Next Steps"""

    def research(self, query: str, data: pd.DataFrame) -> str:
-         """Executes in-depth research using the Groq API to produce academic-quality analyses."""
+         """Conduct academic-level analysis using Groq"""
        try:
-             dataset_info = {
-                 "dataset_shape": str(data.shape),
-                 "dataset_variables": ", ".join(data.columns),
-                 "temporal_coverage": str(data.select_dtypes(include='datetime').columns.tolist()),
-                 "missing_values": str(data.isnull().sum().to_dict()),
-             }
-
-             prompt = Template(self.system_template).render(**dataset_info, query=query)
+             dataset_info = f"""
+             Dataset Dimensions: {data.shape}
+             Variables: {', '.join(data.columns)}
+             Temporal Coverage: {data.select_dtypes(include='datetime').columns.tolist()}
+             Missing Values: {data.isnull().sum().to_dict()}
+             """
+
+             prompt = PromptTemplate.from_template(self.system_template).format(
+                 dataset_info=dataset_info,
+                 query=query
+             )

            completion = client.chat.completions.create(
                messages=[
-                     {"role": "system", "content": "You are a research AI assistant."},
+                     {"role": "system", "content": "You are a research AI assistant"},
                    {"role": "user", "content": prompt}
                ],
                model=self.model_name,
@@ -89,22 +73,20 @@ class GroqResearcher:
                max_tokens=4096,
                stream=False
            )
            return completion.choices[0].message.content
        except Exception as e:
-             return f"Research Error Encountered: {str(e)}"
+             return f"Research Error: {str(e)}"

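Note: the rewritten `research` method calls `PromptTemplate.from_template`, while this commit also deletes the `from jinja2 import Template` import it replaces; no matching LangChain import appears in the diff, so one is needed for the call to resolve. A minimal sketch, assuming the classic `langchain.prompts` location (the exact import path varies across LangChain versions):

```python
# Assumed import, not shown in this diff:
from langchain.prompts import PromptTemplate

# The new template uses str.format-style {placeholders} rather than Jinja2's
# {{ ... }}, so plain string formatting is an equivalent dependency-free fallback:
prompt = self.system_template.format(dataset_info=dataset_info, query=query)
```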
@tool(args_schema=ResearchInput)
def advanced_eda(data_key: str) -> Dict:
-     """
-     Performs a comprehensive Exploratory Data Analysis, including statistical profiling,
-     temporal analysis of datetime columns, and detailed quality checks.
-     """
+     """Comprehensive Exploratory Data Analysis with Statistical Profiling"""
    try:
        data = st.session_state[data_key]
        analysis = {
            "dimensionality": {
-                 "rows": int(len(data)),  # Ensure rows are an integer
+                 "rows": len(data),
                "columns": list(data.columns),
                "memory_usage": f"{data.memory_usage().sum() / 1e6:.2f} MB"
            },
@@ -112,147 +94,112 @@ def advanced_eda(data_key: str) -> Dict:
            "temporal_analysis": {
                "date_ranges": {
                    col: {
-                         "min": str(data[col].min()),  # Ensure date is a string
-                         "max": str(data[col].max())  # Ensure date is a string
+                         "min": data[col].min(),
+                         "max": data[col].max()
                    } for col in data.select_dtypes(include='datetime').columns
                }
            },
            "data_quality": {
                "missing_values": data.isnull().sum().to_dict(),
-                 "duplicates": int(data.duplicated().sum()),  # Ensure duplicates are an integer
+                 "duplicates": data.duplicated().sum(),
                "cardinality": {
-                     col: int(data[col].nunique()) for col in data.columns  # Ensure cardinality is an integer
+                     col: data[col].nunique() for col in data.columns
                }
            }
        }
        return analysis
    except Exception as e:
-         return {"error": f"Advanced EDA Failed: {str(e)}"}
+         return {"error": f"EDA Failed: {str(e)}"}
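Note: dropping the `int()`/`str()` casts means `advanced_eda` now returns numpy integers and pandas `Timestamp` objects, which `st.json` may fail to serialize (the deleted comments existed precisely to guard this). A minimal sketch of a recursive guard; `make_json_safe` is a hypothetical helper, not part of this commit:

```python
import numpy as np
import pandas as pd

def make_json_safe(obj):
    """Coerce numpy/pandas scalars to JSON-friendly builtins (sketch)."""
    if isinstance(obj, dict):
        return {str(k): make_json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [make_json_safe(v) for v in obj]
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, pd.Timestamp):
        return str(obj)
    return obj
```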
 
@tool(args_schema=ResearchInput)
def visualize_distributions(data_key: str, columns: List[str]) -> str:
-     """
-     Generates high-quality, publication-ready distribution visualizations (histograms with KDE)
-     for selected numerical columns, and returns the image as a base64-encoded string.
-     """
+     """Generate publication-quality distribution visualizations"""
    try:
        data = st.session_state[data_key]
-         plt.figure(figsize=(15, 7))  # Adjusted figure size for better readability
+         plt.figure(figsize=(12, 6))
        for i, col in enumerate(columns, 1):
            plt.subplot(1, len(columns), i)
-             sns.histplot(data[col], kde=True, stat="density", color=sns.color_palette()[i % len(sns.color_palette())])
-             plt.title(f'Distribution of {col}', fontsize=14, fontweight='bold')  # Enhanced title
-             plt.xlabel(col, fontsize=12)
-             plt.ylabel('Density', fontsize=12)
-             plt.xticks(fontsize=10)
-             plt.yticks(fontsize=10)
-             plt.grid(axis='y', linestyle='--')
-         sns.despine(top=True, right=True)  # Improved styling
-         plt.tight_layout(pad=2)  # Added padding for tight layout
+             sns.histplot(data[col], kde=True, stat="density")
+             plt.title(f'Distribution of {col}', fontsize=10)
+             plt.xticks(fontsize=8)
+             plt.yticks(fontsize=8)
+         plt.tight_layout()

        buf = io.BytesIO()
        plt.savefig(buf, format='png', dpi=300, bbox_inches='tight')
        plt.close()
        return base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
-         return f"Distribution Visualization Error: {str(e)}"
+         return f"Visualization Error: {str(e)}"
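One edge case survives in both versions of `visualize_distributions`: an empty `columns` list skips the plotting loop entirely and silently returns a blank image. A one-line guard (sketch, not in this commit):

```python
# Bail out early instead of encoding an empty figure (sketch):
if not columns:
    return "No columns selected"
```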
 
@tool(args_schema=TemporalAnalysisInput)
def temporal_analysis(data_key: str, time_col: str, value_col: str) -> Dict:
-     """
-     Performs a sophisticated time series analysis, including decomposition and trend assessment,
-     providing both statistical insights and a visual representation.
-     """
+     """Time Series Decomposition and Trend Analysis"""
    try:
        data = st.session_state[data_key]
-         ts_data = data.set_index(pd.to_datetime(data[time_col]))[value_col].dropna()  # Handle NaNs
-
-         if ts_data.empty:
-             return {"error": "No valid time series data found for analysis after NaN removal."}
-
-         decomposition = seasonal_decompose(ts_data, model='additive', period=min(len(ts_data), 365) if len(ts_data) > 10 else 1)
-
-         plt.figure(figsize=(16, 10))
+         ts_data = data.set_index(pd.to_datetime(data[time_col]))[value_col]
+
+         decomposition = seasonal_decompose(ts_data, period=365)
+
+         plt.figure(figsize=(12, 8))
        decomposition.plot()
        plt.tight_layout()
        buf = io.BytesIO()
-         plt.savefig(buf, format='png', dpi=300)  # Increased dpi for higher resolution
+         plt.savefig(buf, format='png')
        plt.close()
        plot_data = base64.b64encode(buf.getvalue()).decode()
-
-         adf_result = adfuller(ts_data)
-         stationarity_p_value = adf_result[1]
        return {
            "trend_statistics": {
-                 "stationarity": stationarity_p_value,
-                 "stationarity_interpretation": interpret_p_value(stationarity_p_value),
-                 "seasonality_strength": max(decomposition.seasonal) if hasattr(decomposition, 'seasonal') else None
+                 "stationarity": adfuller(ts_data)[1],
+                 "seasonality_strength": max(decomposition.seasonal)
            },
-             "visualization": plot_data,
-             "decomposition_data": {
-                 "trend": decomposition.trend.dropna().to_dict() if hasattr(decomposition, 'trend') else None,
-                 "seasonal": decomposition.seasonal.dropna().to_dict() if hasattr(decomposition, 'seasonal') else None,
-                 "residual": decomposition.resid.dropna().to_dict() if hasattr(decomposition, 'resid') else None,
-             }
+             "visualization": plot_data
        }
    except Exception as e:
-         return {"error": f"Temporal Analysis Failure: {str(e)}"}
+         return {"error": f"Temporal Analysis Failed: {str(e)}"}
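Note: this commit removes the `statsmodels` imports at the top of the file, yet the new `temporal_analysis` body still calls `seasonal_decompose` and `adfuller`, so those imports must stay for the tool to run at all. The hard-coded `period=365` is also stricter than the deleted adaptive logic: `seasonal_decompose` raises unless the series contains at least two full cycles, and both routines choke on NaNs. A hedged sketch of the needed repairs:

```python
# Still required after this commit (deleted from the import block above):
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

ts = ts_data.dropna()  # decomposition and the ADF test reject missing values
# Fall back to a shorter period when fewer than two years of data exist;
# series shorter than ~4 points cannot be decomposed at all.
period = min(365, max(2, len(ts) // 2))
decomposition = seasonal_decompose(ts, model="additive", period=period)
```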
 
@tool(args_schema=HypothesisInput)
def hypothesis_testing(data_key: str, group_col: str, value_col: str) -> Dict:
-     """
-     Conducts statistical hypothesis testing, providing detailed test results, effect size measures,
-     and interpretations for both t-tests and ANOVAs.
-     """
+     """Statistical Hypothesis Testing with Automated Assumption Checking"""
    try:
        data = st.session_state[data_key]
        groups = data[group_col].unique()

        if len(groups) < 2:
-             return {"error": "Insufficient groups for comparison. Must have at least two groups."}
-
-         group_data = [data[data[group_col] == g][value_col].dropna() for g in groups]
-
-         if any(len(group) < 2 for group in group_data):
-             return {"error": "Each group must have at least two data points for testing."}
+             return {"error": "Insufficient groups for comparison"}

        if len(groups) == 2:
+             group_data = [data[data[group_col] == g][value_col] for g in groups]
            stat, p = ttest_ind(*group_data)
            test_type = "Independent t-test"
        else:
+             group_data = [data[data[group_col] == g][value_col] for g in groups]
            stat, p = f_oneway(*group_data)
            test_type = "ANOVA"

-         effect_size = None
-         if len(groups) == 2:
-             pooled_variance = np.sqrt((group_data[0].var() + group_data[1].var()) / 2)
-             if pooled_variance != 0:
-                 cohens_d = abs(group_data[0].mean() - group_data[1].mean()) / pooled_variance
-                 effect_size = {"cohens_d": cohens_d}
-             else:
-                 effect_size = {"cohens_d": None, "error": "Cannot compute effect size due to zero pooled variance."}
-
        return {
            "test_type": test_type,
-             "test_statistic": float(stat),  # Ensure stat is a float
-             "p_value": float(p),  # Ensure p_value is a float
-             "effect_size": effect_size,
-             "interpretation": interpret_p_value(p),
-             "group_means": {g: float(data[data[group_col] == g][value_col].mean()) for g in groups}  # Group means
+             "test_statistic": stat,
+             "p_value": p,
+             "effect_size": {
+                 "cohens_d": abs(group_data[0].mean() - group_data[1].mean())/np.sqrt(
+                     (group_data[0].var() + group_data[1].var())/2
+                 ) if len(groups) == 2 else None
+             },
+             "interpretation": interpret_p_value(p)
        }
    except Exception as e:
        return {"error": f"Hypothesis Testing Failed: {str(e)}"}
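Two deleted guards are worth flagging: the per-group `dropna()` (scipy's `ttest_ind` and `f_oneway` propagate NaNs by default, so one missing value turns the statistic into NaN) and the zero-pooled-variance check, without which the new Cohen's d expression divides by zero for constant groups. A minimal sketch restoring both, reusing the diff's own names:

```python
# Drop missing values per group before testing (sketch):
group_data = [data.loc[data[group_col] == g, value_col].dropna() for g in groups]
if any(len(g) < 2 for g in group_data):
    return {"error": "Each group needs at least two observations"}

# Compute Cohen's d only when the pooled spread is non-zero:
pooled_sd = np.sqrt((group_data[0].var() + group_data[1].var()) / 2)
cohens_d = (
    abs(group_data[0].mean() - group_data[1].mean()) / pooled_sd
    if len(groups) == 2 and pooled_sd > 0 else None
)
```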
 
def interpret_p_value(p: float) -> str:
-     """Provides nuanced interpretations of p-values, including qualitative descriptors."""
-     if p < 0.001: return "Highly significant evidence against the null hypothesis (p < 0.001)."
-     elif p < 0.01: return "Strong evidence against the null hypothesis (0.001 ≤ p < 0.01)."
-     elif p < 0.05: return "Moderate evidence against the null hypothesis (0.01 ≤ p < 0.05)."
-     elif p < 0.1: return "Weak evidence against the null hypothesis (0.05 ≤ p < 0.1)."
-     else: return "No significant evidence against the null hypothesis (p ≥ 0.1)."
+     """Scientific interpretation of p-values"""
+     if p < 0.001: return "Very strong evidence against H0"
+     elif p < 0.01: return "Strong evidence against H0"
+     elif p < 0.05: return "Evidence against H0"
+     elif p < 0.1: return "Weak evidence against H0"
+     else: return "No significant evidence against H0"

def main():
    st.set_page_config(page_title="AI Research Lab", layout="wide")
@@ -270,12 +217,9 @@ def main():
    uploaded_file = st.file_uploader("Upload research dataset", type=["csv", "parquet"])
    if uploaded_file:
        with st.spinner("Initializing dataset..."):
-             try:
-                 st.session_state.data = pd.read_csv(uploaded_file)
-                 st.success(f"Loaded {len(st.session_state.data):,} research observations")
-             except Exception as e:
-                 st.error(f"Error loading the dataset. Please ensure it's a valid CSV or Parquet format. Error details: {e}")
+             st.session_state.data = pd.read_csv(uploaded_file)
+             st.success(f"Loaded {len(st.session_state.data):,} research observations")

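The uploader still advertises Parquet, but the simplified loader always calls `pd.read_csv` and loses the old try/except, so a `.parquet` upload now surfaces a raw parse error. A dispatch on the file name (sketch; `uploaded_file.name` is the standard attribute on Streamlit's `UploadedFile`):

```python
# Route by extension so the advertised Parquet support works (sketch):
if uploaded_file.name.endswith(".parquet"):
    st.session_state.data = pd.read_parquet(uploaded_file)
else:
    st.session_state.data = pd.read_csv(uploaded_file)
st.success(f"Loaded {len(st.session_state.data):,} research observations")
```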
    # Main research interface
    if st.session_state.data is not None:
        col1, col2 = st.columns([1, 3])
@@ -286,10 +230,10 @@
                "Variables": list(st.session_state.data.columns),
                "Time Range": {
                    col: {
-                         "min": str(st.session_state.data[col].min()),
-                         "max": str(st.session_state.data[col].max())
+                         "min": st.session_state.data[col].min(),
+                         "max": st.session_state.data[col].max()
                    } for col in st.session_state.data.select_dtypes(include='datetime').columns
-                 } if st.session_state.data.select_dtypes(include='datetime').columns.tolist() else "No Temporal Data",
+                 },
                "Size": f"{st.session_state.data.memory_usage().sum() / 1e6:.2f} MB"
            })

@@ -310,42 +254,35 @@
                st.json(eda_result)

            elif analysis_type == "Temporal Pattern Analysis":
-                 time_cols = st.session_state.data.select_dtypes(include='datetime').columns.tolist()
-                 if not time_cols:
-                     st.warning("No datetime columns detected. Please ensure you have a datetime column for this analysis.")
-                 else:
-                     time_col = st.selectbox("Temporal Variable", time_cols)
-                     value_col = st.selectbox("Analysis Variable",
-                                              st.session_state.data.select_dtypes(include=np.number).columns)
-
-                     if time_col and value_col:
-                         result = temporal_analysis.invoke({
-                             "data_key": "data",
-                             "time_col": time_col,
-                             "value_col": value_col
-                         })
-                         if "visualization" in result:
-                             st.image(f"data:image/png;base64,{result['visualization']}",
-                                      use_column_width=True)
-                         st.json(result)
+                 time_col = st.selectbox("Temporal Variable",
+                                         st.session_state.data.select_dtypes(include='datetime').columns)
+                 value_col = st.selectbox("Analysis Variable",
+                                          st.session_state.data.select_dtypes(include=np.number).columns)
+
+                 if time_col and value_col:
+                     result = temporal_analysis.invoke({
+                         "data_key": "data",
+                         "time_col": time_col,
+                         "value_col": value_col
+                     })
+                     if "visualization" in result:
+                         st.image(f"data:image/png;base64,{result['visualization']}")
+                     st.json(result)

            elif analysis_type == "Comparative Statistics":
-                 cat_cols = st.session_state.data.select_dtypes(include='category').columns.tolist() + st.session_state.data.select_dtypes(include='object').columns.tolist()
-                 if not cat_cols:
-                     st.warning("No categorical columns detected. Please ensure you have a categorical column for this analysis.")
-                 else:
-                     group_col = st.selectbox("Grouping Variable", cat_cols)
-                     value_col = st.selectbox("Metric Variable",
-                                              st.session_state.data.select_dtypes(include=np.number).columns)
-
-                     if group_col and value_col:
-                         result = hypothesis_testing.invoke({
-                             "data_key": "data",
-                             "group_col": group_col,
-                             "value_col": value_col
-                         })
-                         st.subheader("Statistical Test Results")
-                         st.json(result)
+                 group_col = st.selectbox("Grouping Variable",
+                                          st.session_state.data.select_dtypes(include='category').columns)
+                 value_col = st.selectbox("Metric Variable",
+                                          st.session_state.data.select_dtypes(include=np.number).columns)
+
+                 if group_col and value_col:
+                     result = hypothesis_testing.invoke({
+                         "data_key": "data",
+                         "group_col": group_col,
+                         "value_col": value_col
+                     })
+                     st.subheader("Statistical Test Results")
+                     st.json(result)
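Note: the rewritten Comparative Statistics branch narrows the grouping choices to `select_dtypes(include='category')`, but columns loaded from CSV arrive as plain `object` dtype, so this selectbox will usually be empty; the deleted code offered both dtypes. A sketch of the wider selection:

```python
# Offer both pandas 'category' and plain string/object columns (sketch):
cat_cols = st.session_state.data.select_dtypes(include=["category", "object"]).columns
group_col = st.selectbox("Grouping Variable", cat_cols)
```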
 
            elif analysis_type == "Distribution Analysis":
                num_cols = st.session_state.data.select_dtypes(include=np.number).columns.tolist()
@@ -355,8 +292,7 @@ def main():
                    "data_key": "data",
                    "columns": selected_cols
                })
-                 st.image(f"data:image/png;base64,{img_data}",
-                          use_column_width=True)
+                 st.image(f"data:image/png;base64,{img_data}")

        with research_tab:
            research_query = st.text_area("Enter Research Question:", height=150,