Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

CognitiveEDA / app.py

mgbam

Update app.py

f7b84f1 verified about 1 month ago

raw

history blame

19.5 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import io
	import json
	import warnings
	import google.generativeai as genai
	import os
	import logging
	from contextlib import redirect_stdout
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
	from sklearn.linear_model import LogisticRegression, LinearRegression
	from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error
	from sklearn.preprocessing import LabelEncoder

	# --- Configuration ---
	# Silence every library warning and emit timestamped INFO-level log lines.
	warnings.filterwarnings('ignore')
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(levelname)s - %(message)s',
	)

	# Translucent dark overrides layered on top of the stock Glass theme.
	_THEME_OVERRIDES = {
	    "body_background_fill": "rgba(0,0,0,0.8)",
	    "block_background_fill": "rgba(0,0,0,0.6)",
	    "block_border_width": "1px",
	    "border_color_primary": "rgba(255,255,255,0.1)",
	}
	THEME = gr.themes.Glass(primary_hue="blue", secondary_hue="cyan").set(**_THEME_OVERRIDES)

	# Estimator classes keyed by problem type, then by the display name shown in the UI.
	MODEL_REGISTRY = {
	    "Classification": {
	        "Random Forest": RandomForestClassifier,
	        "Logistic Regression": LogisticRegression,
	    },
	    "Regression": {
	        "Random Forest": RandomForestRegressor,
	        "Linear Regression": LinearRegression,
	    },
	}

	# --- Core Logic ---

	def safe_exec(code_string: str, local_vars: dict) -> tuple:
	    """Execute a string of Python code and capture what it produced.

	    SECURITY: despite the name, nothing here is sandboxed. ``exec`` runs
	    ``code_string`` with the full privileges of this process, so callers
	    must treat the code (for example model-generated code) as trusted or
	    isolate the process themselves.

	    The code runs in a single namespace seeded with this module's globals
	    and then ``local_vars``. Using one namespace, instead of separate
	    globals and locals dicts, lets lambdas, nested functions and
	    comprehensions inside the executed code see ``local_vars`` entries and
	    any variable the code itself assigns.

	    Args:
	        code_string: Python source to execute.
	        local_vars: Names made available to the code. Names the code
	            assigns or rebinds are written back into this dict.

	    Returns:
	        A tuple ``(stdout, fig, df_result, error)``. On success ``error`` is
	        None, ``stdout`` is the captured standard output, and ``fig`` /
	        ``df_result`` are whatever the code bound to those names (None when
	        unset). On failure the first three items are None and ``error`` is
	        an ``"Execution Error: ..."`` string.
	    """
	    module_globals = globals()
	    namespace = {**module_globals, **local_vars}
	    output_buffer = io.StringIO()
	    try:
	        with redirect_stdout(output_buffer):
	            exec(code_string, namespace)
	    except Exception as e:
	        return None, None, None, f"Execution Error: {str(e)}"

	    # Copy results back to the caller's dict, skipping module globals the
	    # executed code merely inherited and did not rebind.
	    unset = object()
	    for name, value in namespace.items():
	        if name in local_vars or module_globals.get(name, unset) is not value:
	            local_vars[name] = value

	    return (
	        output_buffer.getvalue(),
	        local_vars.get('fig'),
	        local_vars.get('df_result'),
	        None,
	    )

	def _coerce_object_columns(df):
	    """Convert object columns in place: to datetime when every value parses, else to category when under 50% unique."""
	    row_count = len(df)
	    for col in df.select_dtypes(include=['object']).columns:
	        try:
	            df[col] = pd.to_datetime(df[col], errors='raise')
	        except (ValueError, TypeError):
	            # row_count guard avoids a ZeroDivisionError on a header-only CSV.
	            if row_count and df[col].nunique() / row_count < 0.5:
	                df[col] = df[col].astype('category')


	def _top_correlations(df, numeric_cols, limit=5):
	    """Return the ``limit`` largest absolute pairwise correlations, one entry per column pair."""
	    corr = df[numeric_cols].corr().abs()
	    # Keep only the strict lower triangle so the diagonal (self-correlation)
	    # and the mirrored duplicate of every pair are excluded.
	    one_per_pair = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
	    pairs = corr.where(one_per_pair).unstack().dropna()
	    return pairs.sort_values(ascending=False).head(limit)


	def _count_iqr_outliers(df, numeric_cols):
	    """Map each numeric column to its count of values beyond 1.5 * IQR from the quartiles (zero counts omitted)."""
	    outliers = {}
	    for col in numeric_cols:
	        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
	        iqr = q3 - q1
	        flagged = (df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))
	        count = int(flagged.sum())
	        if count > 0:
	            outliers[col] = count
	    return outliers


	def _suggest_ml_targets(df, metadata):
	    """List candidate target columns: two-valued categoricals and numerics with more than 20 distinct values."""
	    suggestions = [
	        f"{col} (Binary Classification)"
	        for col in metadata['categorical_cols']
	        if df[col].nunique() == 2
	    ]
	    suggestions += [
	        f"{col} (Regression)"
	        for col in metadata['numeric_cols']
	        if df[col].nunique() > 20  # Heuristic for continuous target
	    ]
	    return suggestions


	def prime_data(file_obj):
	    """Load an uploaded CSV, analyse it, and build the application state.

	    Args:
	        file_obj: The value delivered by the upload component; either an
	            object exposing the path as ``.name`` or the path itself.

	    Returns:
	        A dict mapping Gradio components to their new values/updates. On
	        success it carries the new state plus the insight markdown and the
	        column choices for the Data Medic and Oracle selectors. On failure
	        it carries only an error message for the insights panel (and makes
	        the tab container visible so that message can be seen).
	    """
	    if not file_obj:
	        # Nothing uploaded: keep the tab container hidden.
	        return {phoenix_tabs: gr.update(visible=False)}

	    try:
	        csv_path = getattr(file_obj, 'name', file_obj)
	        df = pd.read_csv(csv_path)

	        # Smart type conversion
	        _coerce_object_columns(df)

	        # --- Phoenix Eye: Proactive Insights Engine ---
	        metadata = extract_dataset_metadata(df)
	        missing = df.isnull().sum()
	        insights = {
	            'missing': missing[missing > 0].sort_values(ascending=False),
	            'high_cardinality': {
	                c: df[c].nunique()
	                for c in metadata['categorical_cols']
	                if df[c].nunique() > 50
	            },
	            'outliers': _count_iqr_outliers(df, metadata['numeric_cols']),
	            'ml_suggestions': _suggest_ml_targets(df, metadata),
	        }
	        # The key is only present when there are at least two numeric columns.
	        if len(metadata['numeric_cols']) > 1:
	            insights['high_correlations'] = _top_correlations(df, metadata['numeric_cols'])

	        state = {
	            'df_original': df,
	            'df_modified': df.copy(),
	            'filename': os.path.basename(csv_path),
	            'metadata': metadata,
	            'proactive_insights': insights,
	        }

	        # Generate UI updates
	        overview_md = generate_phoenix_eye_markdown(state)
	        all_cols = metadata['columns']

	        return {
	            global_state: state,
	            phoenix_tabs: gr.update(visible=True),
	            phoenix_eye_output: overview_md,
	            # Data Medic updates; value is reset so a selection from a
	            # previously loaded file does not linger.
	            medic_col_select: gr.update(
	                choices=insights['missing'].index.tolist(), value=None, interactive=True
	            ),
	            # Oracle updates
	            oracle_target_select: gr.update(choices=all_cols, value=None, interactive=True),
	            oracle_feature_select: gr.update(choices=all_cols, value=[], interactive=True),
	        }

	    except Exception as e:
	        logging.exception("Priming Error: %s", e)
	        # The insights panel lives inside the tab container, which starts
	        # hidden; reveal it so the error message is actually visible.
	        return {
	            phoenix_tabs: gr.update(visible=True),
	            phoenix_eye_output: gr.update(value=f"❌ Error: {e}"),
	        }

	def extract_dataset_metadata(df):
	    """Extract typed metadata from a DataFrame.

	    Args:
	        df: The DataFrame to describe.

	    Returns:
	        A dict with:
	        - ``shape``: ``(rows, cols)`` tuple.
	        - ``columns``: all column labels, in frame order.
	        - ``numeric_cols``: columns with a numeric dtype.
	        - ``categorical_cols``: columns with ``object`` or ``category`` dtype.
	        - ``datetime_cols``: datetime columns, both naive and timezone-aware.
	        - ``dtypes``: mapping of column label to dtype name.
	    """
	    rows, cols = df.shape
	    return {
	        'shape': (rows, cols),
	        'columns': df.columns.tolist(),
	        'numeric_cols': df.select_dtypes(include=np.number).columns.tolist(),
	        'categorical_cols': df.select_dtypes(include=['object', 'category']).columns.tolist(),
	        # 'datetimetz' is required for timezone-aware columns; 'datetime64'
	        # alone only matches naive datetimes.
	        'datetime_cols': df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist(),
	        'dtypes': {col: dtype.name for col, dtype in df.dtypes.items()},
	    }

	def generate_phoenix_eye_markdown(state):
	    """Render the proactive-insights dashboard as a markdown string.

	    Reads ``state['filename']``, ``state['metadata']['shape']`` and the
	    entries of ``state['proactive_insights']`` and emits one section each
	    for ML targets, missing data, correlations, outliers and high
	    cardinality, in that order.
	    """
	    insights = state['proactive_insights']
	    shape = state['metadata']['shape']

	    # Fragments are collected in order and joined once at the end.
	    parts = [
	        f"## 🦅 Phoenix Eye: Proactive Insights for `{state['filename']}`\n",
	        f"Dataset has {shape[0]} rows and {shape[1]} columns.\n\n",
	    ]

	    # ML Suggestions
	    parts.append("### 🔮 Potential ML Targets\n")
	    targets = insights['ml_suggestions']
	    if not targets:
	        parts.append("No obvious ML target columns found.\n")
	    else:
	        parts.extend(f"- `{target}`\n" for target in targets)
	    parts.append("\n")

	    # Missing Data
	    parts.append("### 💧 Missing Data\n")
	    missing = insights['missing']
	    if missing.empty:
	        parts.append("✅ No missing data found!\n")
	    else:
	        parts.append("Found missing values in these columns. Use the Data Medic tab to fix.\n")
	        parts.append(missing.to_frame('Missing Count').to_markdown() + "\n")
	    parts.append("\n")

	    # High Correlation
	    parts.append("### 🔗 Top Correlations\n")
	    if 'high_correlations' not in insights or insights['high_correlations'].empty:
	        parts.append("No strong correlations found between numeric features.\n")
	    else:
	        parts.append(insights['high_correlations'].to_frame('Correlation').to_markdown() + "\n")
	    parts.append("\n")

	    # Outliers
	    parts.append("### 📈 Outlier Alert\n")
	    outlier_counts = insights['outliers']
	    if not outlier_counts:
	        parts.append("✅ No significant outliers detected.\n")
	    else:
	        parts.extend(
	            f"- `{col}` has {count} potential outliers.\n"
	            for col, count in outlier_counts.items()
	        )
	    parts.append("\n")

	    # High Cardinality
	    parts.append("### 🇇 High Cardinality Warning\n")
	    wide_columns = insights['high_cardinality']
	    if not wide_columns:
	        parts.append("✅ No high-cardinality categorical columns found.\n")
	    else:
	        parts.extend(
	            f"- `{col}` has {count} unique values, which may be problematic for some models.\n"
	            for col, count in wide_columns.items()
	        )
	    parts.append("\n")

	    return "".join(parts)

	# --- Tab Handlers ---

	def _imputation_value(series, method):
	    """Return ``(fill_value, method_used)`` for ``series``; ``fill_value`` is None when nothing can be computed.

	    'mean' and 'median' only apply to numeric columns; for any other dtype
	    the most frequent value ('mode') is used instead.
	    """
	    if method in ('mean', 'median') and pd.api.types.is_numeric_dtype(series):
	        value = series.mean() if method == 'mean' else series.median()
	        method_used = method
	    else:
	        modes = series.mode()
	        # mode() is empty when the column has no non-missing values.
	        value = modes.iloc[0] if not modes.empty else None
	        method_used = 'mode'
	    if value is None or pd.isna(value):
	        return None, method_used
	    return value, method_used


	def medic_preview_imputation(state, col, method):
	    """Build a before/after histogram for imputing ``col`` with ``method``.

	    Args:
	        state: Application state dict; ``state['df_original']`` is read.
	        col: Column to preview.
	        method: One of 'mean', 'median' or 'mode'.

	    Returns:
	        A plotly Figure overlaying the distributions before and after the
	        fill, or None when no dataset or no valid column is available.
	    """
	    if not col or not state or col not in state['df_original'].columns:
	        return None
	    before = state['df_original'][col]
	    value, _ = _imputation_value(before, method)
	    after = before if value is None else before.fillna(value)

	    fig = go.Figure()
	    fig.add_trace(go.Histogram(x=before, name='Before', opacity=0.7))
	    fig.add_trace(go.Histogram(x=after, name='After', opacity=0.7))
	    fig.update_layout(barmode='overlay', title=f"'{col}' Distribution: Before vs. After Imputation", legend_title_text='Dataset')
	    return fig


	def medic_apply_imputation(state, col, method):
	    """Fill the missing values of ``col`` in ``state['df_modified']`` and refresh the missing-data summary.

	    Args:
	        state: Application state dict; updated in place and returned.
	        col: Column to impute.
	        method: One of 'mean', 'median' or 'mode'.

	    Returns:
	        A 3-tuple ``(state, status_message, dropdown_update)``. Every path
	        returns three values because the click handler is wired to three
	        output components.
	    """
	    if not state or 'df_modified' not in state:
	        return state, "No dataset loaded.", gr.update()
	    if not col or col not in state['df_modified'].columns:
	        return state, "No column selected.", gr.update()

	    df_mod = state['df_modified'].copy()
	    value, method_used = _imputation_value(df_mod[col], method)
	    if value is None:
	        return state, f"⚠️ Could not compute a fill value for '{col}'.", gr.update()

	    df_mod[col] = df_mod[col].fillna(value)
	    state['df_modified'] = df_mod

	    # Re-run the missing-data insight on the modified df, sorted the same
	    # way as when the file was first loaded.
	    missing = df_mod.isnull().sum()
	    state['proactive_insights']['missing'] = missing[missing > 0].sort_values(ascending=False)
	    remaining = state['proactive_insights']['missing'].index.tolist()

	    # Clear the selection: the imputed column is no longer among the choices.
	    return state, f"✅ Applied '{method_used}' imputation to '{col}'.", gr.update(choices=remaining, value=None)

	def download_cleaned_data(state):
	    """Write the modified dataframe to a temporary CSV and expose it for download.

	    Args:
	        state: Application state dict; ``state['df_modified']`` is written
	            out and ``state['filename']`` is used to name the file.

	    Returns:
	        A Gradio update that points the file component at the written CSV
	        and shows it, or one that hides the component when no dataset is
	        loaded.
	    """
	    import tempfile

	    if not state or 'df_modified' not in state:
	        return gr.update(visible=False)

	    # The file component takes a path, not CSV text, so the frame is written
	    # to disk first. A fresh directory keeps the download name readable.
	    out_dir = tempfile.mkdtemp(prefix="phoenix_")
	    out_path = os.path.join(out_dir, f"cleaned_{state.get('filename') or 'data.csv'}")
	    state['df_modified'].to_csv(out_path, index=False)
	    return gr.update(value=out_path, visible=True)

	def oracle_run_model(state, target, features, model_name):
	    """Train a simple ML model and return its plots and a text report.

	    The problem is treated as classification when the target has at most
	    10 distinct values and as regression otherwise. Rows with missing
	    values in the selected columns are dropped, and object/category
	    columns are label-encoded.

	    Args:
	        state: Application state dict; ``state['df_modified']`` is read.
	        target: Name of the target column.
	        features: List of feature column names.
	        model_name: Display name looked up in ``MODEL_REGISTRY``.

	    Returns:
	        A 3-tuple ``(feature_importance_fig, result_fig, report)``. The
	        first item is None for models without ``feature_importances_``;
	        both figures are None when the inputs are unusable, in which case
	        ``report`` explains why.
	    """
	    if not target or not features:
	        return None, None, "Please select a target and at least one feature."
	    if not state or 'df_modified' not in state:
	        return None, None, "Please upload a dataset first."

	    # Using the target as one of its own features would leak the answer.
	    features = [f for f in features if f != target]
	    if not features:
	        return None, None, "Please select at least one feature other than the target."

	    selected = features + [target]
	    df = state['df_modified'][selected].dropna()
	    if df.empty:
	        return None, None, "Not enough data after dropping NA values."

	    # Preprocessing: integer-encode text/categorical columns.
	    for col in selected:
	        if df[col].dtype == 'object' or df[col].dtype.name == 'category':
	            df[col] = LabelEncoder().fit_transform(df[col])

	    X = df[features]
	    y = df[target]

	    problem_type = "Classification" if y.nunique() <= 10 else "Regression"

	    if model_name not in MODEL_REGISTRY[problem_type]:
	        return None, None, f"Model {model_name} not suitable for {problem_type}."

	    model_cls = MODEL_REGISTRY[problem_type][model_name]
	    # Not every estimator takes random_state (LinearRegression does not),
	    # so only pass it where the constructor declares it.
	    seed_kwargs = {'random_state': 42} if 'random_state' in model_cls().get_params() else {}
	    model = model_cls(**seed_kwargs)

	    try:
	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
	        model.fit(X_train, y_train)
	        preds = model.predict(X_test)
	    except (ValueError, TypeError) as exc:
	        return None, None, f"Model training failed: {exc}"

	    # Feature importance is shared by both problem types.
	    fi_fig = None
	    if hasattr(model, 'feature_importances_'):
	        fi = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
	        fi_fig = px.bar(fi, title="Feature Importance")

	    # Results
	    if problem_type == "Classification":
	        acc = accuracy_score(y_test, preds)
	        cm = confusion_matrix(y_test, preds)
	        cm_fig = px.imshow(cm, text_auto=True, title=f"Confusion Matrix (Accuracy: {acc:.2f})")
	        return fi_fig, cm_fig, f"Classification Report:\n- Accuracy: {acc:.2f}"

	    # Regression
	    r2 = r2_score(y_test, preds)
	    rmse = np.sqrt(mean_squared_error(y_test, preds))
	    scatter_kwargs = dict(
	        x=y_test,
	        y=preds,
	        labels={'x': 'Actual Values', 'y': 'Predicted Values'},
	        title=f"Predictions vs. Actuals (R²: {r2:.2f})",
	    )
	    try:
	        preds_fig = px.scatter(trendline='ols', **scatter_kwargs)
	    except ImportError:
	        # The OLS trendline needs the optional statsmodels package; fall
	        # back to a plain scatter when it is unavailable.
	        preds_fig = px.scatter(**scatter_kwargs)
	    return fi_fig, preds_fig, f"Regression Report:\n- R² Score: {r2:.2f}\n- RMSE: {rmse:.2f}"

	def copilot_respond(user_message, history, state, api_key):
	    """Handle one AI Co-pilot chat turn.

	    This is a generator: every update for the UI must be *yielded*. It
	    yields once with the model's plan and generated code, then again with
	    the execution results.

	    SECURITY: the code returned by the model is executed in this process
	    via ``safe_exec``, which is not sandboxed. The user's message is
	    embedded in the prompt, so it can steer what code is generated.

	    Args:
	        user_message: The user's question.
	        history: Chat history as a list of ``(user, bot)`` tuples, or None.
	        state: Application state dict; ``metadata`` and ``df_modified``
	            are read.
	        api_key: Gemini API key.

	    Yields:
	        4-tuples ``(history, figure, dataframe, code)`` for the chatbot,
	        plot, dataframe and code components.
	    """
	    # Work on a copy so the caller's list is never mutated.
	    history = list(history or [])

	    if not api_key:
	        # A bare `return value` inside a generator is never delivered to
	        # the UI, so the message has to be yielded.
	        yield history + [(user_message, "I need a Gemini API key to function.")], None, None, ""
	        return

	    if not state or 'df_modified' not in state:
	        yield history + [(user_message, "Please upload a dataset first.")], None, None, ""
	        return

	    history.append((user_message, None))

	    prompt = f"""
	You are 'Phoenix Co-pilot', a world-class AI data analyst. Your goal is to help the user by writing and executing Python code.
	You have access to a pandas DataFrame named `df`. This is the user's LATEST data, including any cleaning they've performed.

	DataFrame Info:
	- Columns and dtypes: {json.dumps(state['metadata']['dtypes'])}

	Instructions:
	1. Analyze the user's request: '{user_message}'.
	2. Formulate a plan (thought).
	3. Write Python code to execute the plan.
	4. Use `pandas`, `numpy`, and `plotly.express as px`.
	5. To show a plot, assign it to a variable `fig`. Ex: `fig = px.histogram(df, x='age')`.
	6. To show a dataframe, assign it to a variable `df_result`. Ex: `df_result = df.describe()`.
	7. Use `print()` for text output.
	8. NEVER modify `df` in place. Use `df.copy()` if needed.
	9. Respond ONLY with a single, valid JSON object with keys "thought" and "code".

	User Request: "{user_message}"

	Your JSON Response:
	"""

	    try:
	        genai.configure(api_key=api_key)
	        model = genai.GenerativeModel('gemini-1.5-flash')
	        response = model.generate_content(prompt)

	        # Clean and parse JSON (strip markdown code fences around it).
	        response_json = json.loads(response.text.strip().replace("```json", "").replace("```", ""))
	        thought = response_json.get("thought", "Thinking...")
	        code_to_run = response_json.get("code", "print('No code generated.')")

	        bot_thinking = f"🧠 Thinking: {thought}"
	        history[-1] = (user_message, bot_thinking)
	        yield history, None, None, gr.update(value=code_to_run)

	        # Execute Code against a copy, so generated code that ignores the
	        # "never modify df" instruction cannot corrupt the session data.
	        local_vars = {'df': state['df_modified'].copy(), 'px': px, 'pd': pd, 'np': np}
	        stdout, fig_result, df_result, error = safe_exec(code_to_run, local_vars)

	        # Show a Series result in the table as a one-column frame.
	        if isinstance(df_result, pd.Series):
	            df_result = df_result.to_frame()

	        bot_response = bot_thinking + "\n\n---\n\n"

	        if error:
	            bot_response += f"💥 Execution Error:\n```\n{error}\n```"
	        if stdout:
	            bot_response += f"📋 Output:\n```\n{stdout}\n```"
	        # Compare against None rather than relying on the truthiness of a
	        # figure object.
	        produced_nothing = (
	            not error
	            and not stdout
	            and fig_result is None
	            and not isinstance(df_result, pd.DataFrame)
	        )
	        if produced_nothing:
	            bot_response += "✅ Code executed, but produced no direct output."

	        history[-1] = (user_message, bot_response)
	        yield history, fig_result, df_result, gr.update(value=code_to_run)

	    except Exception as e:
	        error_msg = f"A critical error occurred: {e}. The AI may have returned invalid JSON. Check the generated code."
	        history[-1] = (user_message, error_msg)
	        yield history, None, None, ""

	# --- Gradio UI Construction ---

	# NOTE(review): the nesting below was reconstructed from a source whose
	# indentation had been flattened; confirm the layout against the running app.
	with gr.Blocks(theme=THEME, title="Phoenix AI Data Explorer") as demo:
	    # Per-session application state, replaced by prime_data on upload.
	    global_state = gr.State({})

	    gr.Markdown("# 🔥 Phoenix AI Data Explorer")
	    gr.Markdown("The next-generation analytic tool. Upload your data to awaken the Phoenix.")

	    with gr.Row():
	        file_input = gr.File(label="📁 Upload CSV", file_types=[".csv"])
	        api_key_input = gr.Textbox(label="🔑 Gemini API Key", type="password", placeholder="Enter Google AI Studio key...")

	    # Hidden until prime_data makes the tab container visible.
	    with gr.Tabs(visible=False) as phoenix_tabs:
	        with gr.Tab("🦅 Phoenix Eye"):
	            phoenix_eye_output = gr.Markdown()

	        with gr.Tab("🩺 Data Medic"):
	            gr.Markdown("### Cleanse Your Data\nSelect a column with missing values and choose a method to fill them.")
	            with gr.Row():
	                medic_col_select = gr.Dropdown(label="Select Column to Clean")
	                medic_method_select = gr.Radio(['mean', 'median', 'mode'], label="Imputation Method", value='mean')
	            medic_preview_btn = gr.Button("📊 Preview Changes")
	            medic_plot = gr.Plot()
	            with gr.Row():
	                medic_apply_btn = gr.Button("✅ Apply & Save Changes", variant="primary")
	                medic_status = gr.Textbox(label="Status", interactive=False)
	            with gr.Accordion("Download Cleaned Data", open=False):
	                download_btn = gr.Button("⬇️ Download Cleaned CSV")
	                download_file_output = gr.File(label="Download Link", visible=False)

	        with gr.Tab("🔮 The Oracle (Predictive Modeling)"):
	            gr.Markdown("### Glimpse the Future\nTrain a simple model to see the predictive power of your data.")
	            with gr.Row():
	                oracle_target_select = gr.Dropdown(label="🎯 Select Target Variable")
	                # Gradio has no Multiselect component; multiple selection is
	                # a Dropdown option.
	                oracle_feature_select = gr.Dropdown(label="✨ Select Features", multiselect=True)
	                oracle_model_select = gr.Dropdown(choices=["Random Forest", "Logistic Regression", "Linear Regression"], label="🧠 Select Model")
	            oracle_run_btn = gr.Button("🚀 Train Model!", variant="primary")
	            oracle_status = gr.Markdown()
	            with gr.Row():
	                oracle_fig1 = gr.Plot()
	                oracle_fig2 = gr.Plot()

	        with gr.Tab("🤖 AI Co-pilot"):
	            gr.Markdown("### Your Conversational Analyst\nAsk any question about your data in plain English.")
	            copilot_chatbot = gr.Chatbot(label="Chat History", height=400)
	            with gr.Accordion("AI Generated Results", open=True):
	                copilot_fig_output = gr.Plot()
	                copilot_df_output = gr.Dataframe(interactive=False)
	            with gr.Accordion("Generated Code", open=False):
	                copilot_code_output = gr.Code(language="python", interactive=False)

	            with gr.Row():
	                copilot_input = gr.Textbox(label="Your Question", placeholder="e.g., 'What's the correlation between age and salary?'", scale=4)
	                copilot_submit_btn = gr.Button("Submit", variant="primary", scale=1)

	    # --- Event Wiring ---
	    # Listeners must be registered while the Blocks context is still open.
	    file_input.upload(
	        fn=prime_data,
	        inputs=file_input,
	        outputs=[global_state, phoenix_tabs, phoenix_eye_output, medic_col_select, oracle_target_select, oracle_feature_select],
	        show_progress="full",
	    )

	    # Data Medic
	    medic_preview_btn.click(medic_preview_imputation, [global_state, medic_col_select, medic_method_select], medic_plot)
	    medic_apply_btn.click(medic_apply_imputation, [global_state, medic_col_select, medic_method_select], [global_state, medic_status, medic_col_select])
	    download_btn.click(download_cleaned_data, [global_state], download_file_output)

	    # Oracle
	    oracle_run_btn.click(
	        oracle_run_model,
	        [global_state, oracle_target_select, oracle_feature_select, oracle_model_select],
	        [oracle_fig1, oracle_fig2, oracle_status],
	        show_progress="full",
	    )

	    # AI Co-pilot
	    copilot_submit_btn.click(
	        copilot_respond,
	        [copilot_input, copilot_chatbot, global_state, api_key_input],
	        [copilot_chatbot, copilot_fig_output, copilot_df_output, copilot_code_output],
	    ).then(
	        # Clear input after submit. The lambda takes no arguments, so the
	        # step must declare no inputs.
	        lambda: "", None, copilot_input
	    )

	if __name__ == "__main__":
	    demo.launch(debug=True)