Spaces:

mgbam
/

BizIntel_AI

Sleeping

App Files Files Community

BizIntel_AI / app.py

mgbam

Update app.py

a875537 verified about 2 months ago

raw

history blame

6.65 kB

	"""app.py — BizIntel AI Ultra (Gemini‑only, v4)
	Robust BI copilot featuring:
	• CSV / Excel / Parquet uploads + live SQL ingestion
	• Memory‑safe sampling for large files (≤ 5 M rows sample)
	• Gemini‑generated narrative insights & quick schema audit
	• Interactive EDA (histogram + box, correlation heatmap)
	• Tunable ARIMA forecasting (p,d,q & horizon)
	• One‑click strategy brief download
	"""
	from __future__ import annotations
	import io, os, tempfile
	from pathlib import Path
	from typing import List

	import pandas as pd
	import plotly.express as px
	import streamlit as st
	from sqlalchemy import create_engine
	from statsmodels.tsa.arima.model import ARIMA
	import google.generativeai as genai

	# ───────────────────── 0 · PAGE CONFIG ──────────────────────
	st.set_page_config(
	page_title="BizIntel AI Ultra",
	page_icon="📊",
	layout="wide",
	initial_sidebar_state="expanded",
	)

	auth_key = None
	try:
	auth_key = st.secrets["GEMINI_APIKEY"] # type: ignore[attr-defined]
	except Exception:
	auth_key = os.getenv("GEMINI_APIKEY")

	if not auth_key:
	st.error(
	"❌ GEMINI_APIKEY missing. Add it in Settings → Secrets or set env var before running."
	)
	st.stop()

	genai.configure(api_key=auth_key)
	GEM_MODEL = "gemini-1.5-pro-latest"
	TMP = Path(tempfile.gettempdir())

	# ───────────────────── 1 · HELPERS ──────────────────────────
	@st.cache_data(show_spinner=False)
	def load_file(buf: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
	suf = Path(buf.name).suffix.lower()
	if suf in {".xls", ".xlsx"}:
	return pd.read_excel(buf, engine="openpyxl")
	if suf == ".parquet":
	return pd.read_parquet(buf)
	return pd.read_csv(buf, nrows=5_000_000 if sample else None)

	@st.cache_data(show_spinner=False)
	def list_sql_tables(uri: str) -> List[str]:
	return create_engine(uri).table_names()

	@st.cache_data(show_spinner=True)
	def fetch_sql_table(uri: str, tbl: str) -> pd.DataFrame:
	return pd.read_sql_table(tbl, create_engine(uri))

	@st.cache_data(show_spinner=False)
	def gemini(prompt: str) -> str:
	return genai.GenerativeModel(GEM_MODEL).generate_content(prompt).text.strip()

	# ───────────────────── 2 · DATA LOAD UI ─────────────────────
	st.title("📊 BizIntel AI Ultra — Gemini 1.5 Pro BI Copilot")
	mode = st.sidebar.radio("Data source", ["Upload file", "SQL DB"], horizontal=True)

	df: pd.DataFrame = pd.DataFrame()
	if mode == "Upload file":
	upl = st.sidebar.file_uploader("CSV / Excel / Parquet", ["csv", "xls", "xlsx", "parquet"], help="≤ 2 GB")
	sample = st.sidebar.checkbox("Sample first 5 M rows only")
	if upl:
	df = load_file(upl, sample)
	else:
	uri = st.sidebar.text_input("SQLAlchemy URI")
	if uri:
	tbl = st.sidebar.selectbox("Table", list_sql_tables(uri))
	if tbl:
	df = fetch_sql_table(uri, tbl)

	if df.empty:
	st.info("⬅️ Load data to start analysis.")
	st.stop()

	st.success("✅ Data loaded")
	st.dataframe(df.head(), use_container_width=True)

	# ───────────────────── 3 · OVERVIEW & GEMINI INSIGHT ─────────
	rows, cols = df.shape
	missing_pct = df.isna().sum().sum() / (rows * cols) * 100
	m1,m2,m3 = st.columns(3)
	m1.metric("Rows", f"{rows:,}")
	m2.metric("Columns", cols)
	m3.metric("Missing %", f"{missing_pct:.1f}")

	st.subheader("🧠 Gemini Insights")
	with st.spinner("Generating narrative…"):
	summ_json = df.describe(include="all", datetime_is_numeric=True).round(2).to_json()
	st.markdown(gemini(
	"You are a senior BI analyst. Provide five bullet insights and three actions for this dataset:\n\n" + summ_json
	))

	# ───────────────────── 4 · TIME‑SERIES PREP ──────────────────
	# attempt datetime coercion
	for col in df.columns:
	if not pd.api.types.is_datetime64_any_dtype(df[col]):
	try:
	df[col] = pd.to_datetime(df[col])
	except: pass

	DATE_COL = st.selectbox("Date column", [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])])
	NUM_COL = st.selectbox("Numeric metric", [c for c in df.select_dtypes("number").columns])

	ts = df[[DATE_COL, NUM_COL]].dropna().groupby(DATE_COL)[NUM_COL].mean().sort_index()
	fig_tr = px.line(ts, title=f"{NUM_COL} Trend", labels={"index":"Date", NUM_COL:NUM_COL})
	st.plotly_chart(fig_tr, use_container_width=True)

	# ───────────────────── 5 · ARIMA FORECAST ───────────────────
	st.subheader("🔮 ARIMA Forecast")
	steps = st.slider("Horizon (days)", 3, 365, 90)
	p = st.number_input("p", 0, 5, 1); d = st.number_input("d", 0, 2, 1); q = st.number_input("q", 0, 5, 1)
	try:
	model = ARIMA(ts, order=(p, d, q)).fit()
	future_idx = pd.date_range(ts.index[-1], periods=steps + 1, freq=pd.infer_freq(ts.index) or "D")[1:]
	forecast = pd.Series(model.forecast(steps), index=future_idx, name="Forecast")
	fig_fc = px.line(pd.concat([ts, forecast], axis=1), title="Actual vs Forecast")
	st.plotly_chart(fig_fc, use_container_width=True)
	except Exception as e:
	st.error(f"ARIMA failed: {e}")

	# ───────────────────── 6 · EDA EXPANDERS ────────────────────
	st.subheader("🔍 EDA Dashboard")
	with st.expander("Histogram + Box"):
	st.plotly_chart(px.histogram(df, x=NUM_COL, marginal="box", template="plotly_dark"), use_container_width=True)
	with st.expander("Correlation Heatmap"):
	corr = df.select_dtypes("number").corr()
	st.plotly_chart(px.imshow(corr, color_continuous_scale="RdBu", title="Correlation"), use_container_width=True)

	# ───────────────────── 7 · STRATEGY BRIEF ───────────────────
	brief = (
	"# Strategy Brief\n"
	"• Clean missing date values for stable models.\n"
	"• Investigate high correlations for driver analysis.\n"
	"• Use forecast for inventory & workforce planning.\n"
	"• Monitor outliers (>3σ) weekly.\n"
	"• Segment by region/product for targeted actions."
	)
	st.download_button("⬇️ Download Strategy (.md)", brief, "bizintel_brief.md", "text/markdown")