# NOTE: "Spaces: Running" below was a Hugging Face Spaces status banner
# captured when this page was scraped — it is not part of the application.
# Standard library
from datetime import date
from pathlib import Path

# Third-party
import matplotlib as mpl
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from prophet import Prophet
# -------------------------------------------------
# CONFIG ------------------------------------------
# -------------------------------------------------
CSV_PATH = Path("price_data.csv")
PARQUET_PATH = Path("domae-202503.parquet")
MACRO_START, MACRO_END = "1996-01-01", "2030-12-31"
MICRO_START, MICRO_END = "2020-01-01", "2026-12-31"

# Korean font setup for matplotlib:
# look for an installed font whose family name suggests Korean support.
_KOREAN_FONT_KEYWORDS = ("gothic", "gulim", "malgun", "nanum", "batang")
font_list = [
    f.name
    for f in fm.fontManager.ttflist
    if any(kw in f.name.lower() for kw in _KOREAN_FONT_KEYWORDS)
]
if font_list:
    font_name = font_list[0]
    plt.rcParams['font.family'] = font_name
else:
    # No Korean font available: fall back to matplotlib's default family
    # (Korean labels may render as placeholder boxes).
    plt.rcParams['font.family'] = 'DejaVu Sans'
# BUGFIX: the original only disabled the Unicode minus glyph when a Korean
# font was found; do it unconditionally so negative axis ticks render
# correctly with the fallback font as well.
mpl.rcParams['axes.unicode_minus'] = False

st.set_page_config(page_title="ํ๋ชฉ๋ณ ๊ฐ๊ฒฉ ์์ธก", page_icon="๐", layout="wide")
# ------------------------------------------------- | |
# UTILITIES --------------------------------------- | |
# ------------------------------------------------- | |
DATE_CANDIDATES = {"date", "ds", "ymd", "๋ ์ง", "prce_reg_mm", "etl_ldg_dt"} | |
ITEM_CANDIDATES = {"item", "ํ๋ชฉ", "code", "category", "pdlt_nm", "spcs_nm"} | |
PRICE_CANDIDATES = {"price", "y", "value", "๊ฐ๊ฒฉ", "avrg_prce"} | |
def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame: | |
"""Standardize column names to date/item/price and deduplicate.""" | |
col_map = {} | |
for c in df.columns: | |
lc = c.lower() | |
if lc in DATE_CANDIDATES: | |
col_map[c] = "date" | |
elif lc in PRICE_CANDIDATES: | |
col_map[c] = "price" | |
elif lc in ITEM_CANDIDATES: | |
# first hit as item, second as species | |
if "item" not in col_map.values(): | |
col_map[c] = "item" | |
else: | |
col_map[c] = "species" | |
df = df.rename(columns=col_map) | |
# โโ handle duplicated columns after rename โโโโโโโโโโโโโโโโโโโโโโโโโ | |
if df.columns.duplicated().any(): | |
df = df.loc[:, ~df.columns.duplicated()] | |
# โโ index datetime to column โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
if "date" not in df.columns and df.index.dtype.kind == "M": | |
df.reset_index(inplace=True) | |
df.rename(columns={df.columns[0]: "date"}, inplace=True) | |
# โโ convert YYYYMM string to datetime โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
if "date" in df.columns and pd.api.types.is_object_dtype(df["date"]): | |
if len(df) > 0: # ๋ฐ์ดํฐ๊ฐ ์๋์ง ํ์ธ | |
sample = str(df["date"].iloc[0]) | |
if sample.isdigit() and len(sample) in (6, 8): | |
df["date"] = pd.to_datetime(df["date"].astype(str).str[:6], format="%Y%m", errors="coerce") | |
# โโ build item from pdlt_nm + spcs_nm if needed โโโโโโโโโโโโโโโโโโโโ | |
if "item" not in df.columns and {"pdlt_nm", "spcs_nm"}.issubset(df.columns): | |
df["item"] = df["pdlt_nm"].str.strip() + "-" + df["spcs_nm"].str.strip() | |
# โโ merge item + species โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
if {"item", "species"}.issubset(df.columns): | |
df["item"] = df["item"].astype(str).str.strip() + "-" + df["species"].astype(str).str.strip() | |
df.drop(columns=["species"], inplace=True) | |
return df | |
def load_data() -> pd.DataFrame:
    """Load the price table, preferring Parquet over CSV.

    Normalizes the schema via _standardize_columns, coerces dates, drops
    incomplete rows, and reports progress in the Streamlit sidebar.
    Stops the script when no input file or required column exists.
    """
    try:
        if PARQUET_PATH.exists():
            st.sidebar.info("Parquet ํ์ผ์์ ๋ฐ์ดํฐ๋ฅผ ๋ถ๋ฌ์ต๋๋ค.")
            df = pd.read_parquet(PARQUET_PATH)
            st.sidebar.success(f"Parquet ๋ฐ์ดํฐ ๋ก๋ ์๋ฃ: {len(df)}๊ฐ ํ")
        elif CSV_PATH.exists():
            st.sidebar.info("CSV ํ์ผ์์ ๋ฐ์ดํฐ๋ฅผ ๋ถ๋ฌ์ต๋๋ค.")
            df = pd.read_csv(CSV_PATH)
            st.sidebar.success(f"CSV ๋ฐ์ดํฐ ๋ก๋ ์๋ฃ: {len(df)}๊ฐ ํ")
        else:
            st.error("๐พ price_data.csv ๋๋ domae-202503.parquet ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค.")
            st.stop()

        # Show the raw schema, normalize it, then show the result.
        st.sidebar.write("์๋ณธ ๋ฐ์ดํฐ ์ปฌ๋ผ:", list(df.columns))
        df = _standardize_columns(df)
        st.sidebar.write("ํ์คํ ํ ์ปฌ๋ผ:", list(df.columns))

        missing = {c for c in ["date", "item", "price"] if c not in df.columns}
        if missing:
            st.error(f"ํ์ ์ปฌ๋ผ ๋๋ฝ: {', '.join(missing)} โ ํ์ผ ์ปฌ๋ผ๋ช ์ ํ์ธํ์ธ์.")
            st.stop()

        # Coerce dates and report how many rows became unparseable.
        rows_total = len(df)
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        rows_with_date = df.dropna(subset=["date"]).shape[0]
        if rows_total != rows_with_date:
            st.warning(f"๋ ์ง ๋ณํ ์ค {rows_total - rows_with_date}๊ฐ ํ์ด ์ ์ธ๋์์ต๋๋ค.")

        # Drop rows missing any required field and report the loss.
        rows_before_drop = len(df)
        df = df.dropna(subset=["date", "item", "price"])
        dropped = rows_before_drop - len(df)
        if dropped:
            st.warning(f"NA ์ ๊ฑฐ ์ค {dropped}๊ฐ ํ์ด ์ ์ธ๋์์ต๋๋ค.")

        df.sort_values("date", inplace=True)

        # Summarize the usable date range and item count in the sidebar.
        if len(df) > 0:
            st.sidebar.write(f"๋ฐ์ดํฐ ๋ ์ง ๋ฒ์: {df['date'].min().strftime('%Y-%m-%d')} ~ {df['date'].max().strftime('%Y-%m-%d')}")
            st.sidebar.write(f"์ด ํ๋ชฉ ์: {df['item'].nunique()}")
        else:
            st.error("์ ํจํ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค!")
        return df
    except Exception as e:
        st.error(f"๋ฐ์ดํฐ ๋ก๋ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
        st.stop()
def get_items(df: pd.DataFrame):
    """Return the distinct item names in alphabetical order."""
    unique_items = df["item"].unique().tolist()
    return sorted(unique_items)
def fit_prophet(df: pd.DataFrame, horizon_end: str):
    """Fit a yearly-seasonality Prophet model and forecast daily to horizon_end.

    Duplicate dates are collapsed to their mean price before fitting.
    Returns (model, forecast) on success, or (None, None) when there is
    too little data or fitting raises.
    """
    work = df.copy().dropna(subset=["date", "price"])
    # Collapse duplicate dates: use the mean price per date.
    work = work.groupby("date")["price"].mean().reset_index()

    if len(work) < 2:
        st.warning(f"๋ฐ์ดํฐ ํฌ์ธํธ๊ฐ ๋ถ์กฑํฉ๋๋ค. ์์ธก์ ์ํด์๋ ์ต์ 2๊ฐ ์ด์์ ์ ํจ ๋ฐ์ดํฐ๊ฐ ํ์ํฉ๋๋ค. (ํ์ฌ {len(work)}๊ฐ)")
        return None, None

    # Prophet expects ds/y column names.
    training = work.rename(columns={"date": "ds", "price": "y"})
    try:
        model = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
        model.fit(training)
        # Forecast day-by-day from the last observation to the horizon.
        days_ahead = (pd.Timestamp(horizon_end) - work["date"].max()).days
        future = model.make_future_dataframe(periods=max(days_ahead, 1), freq="D")
        return model, model.predict(future)
    except Exception as e:
        st.error(f"Prophet ๋ชจ๋ธ ์์ฑ ์ค ์ค๋ฅ: {str(e)}")
        return None, None
# -------------------------------------------------
# LOAD DATA ---------------------------------------
# -------------------------------------------------
raw_df = load_data()
if raw_df.empty:
    st.error("๋ฐ์ดํฐ๊ฐ ๋น์ด ์์ต๋๋ค. ํ์ผ์ ํ์ธํด์ฃผ์ธ์.")
    st.stop()

# Sidebar: item picker plus today's date for reference.
st.sidebar.header("๐ ํ๋ชฉ ์ ํ")
selected_item = st.sidebar.selectbox("ํ๋ชฉ", get_items(raw_df))
current_date = date.today()
st.sidebar.caption(f"์ค๋: {current_date}")

# Slice out the selected item's history; bail out if it has no rows.
item_df = raw_df[raw_df["item"] == selected_item].copy()
if item_df.empty:
    st.error("์ ํํ ํ๋ชฉ ๋ฐ์ดํฐ ์์")
    st.stop()
# -------------------------------------------------
# MACRO FORECAST 1996-2030 ------------------------
# -------------------------------------------------
st.header(f"๐ {selected_item} ๊ฐ๊ฒฉ ์์ธก ๋์๋ณด๋")

# Filter the item's history to the macro window; when fewer than 10 rows
# fall inside it, widen the window back to the earliest observation.
try:
    macro_start_dt = pd.Timestamp(MACRO_START)
    if len(item_df[item_df["date"] >= macro_start_dt]) < 10:
        macro_start_dt = item_df["date"].min()
        st.info(f"์ถฉ๋ถํ ๋ฐ์ดํฐ๊ฐ ์์ด ์์ ๋ ์ง๋ฅผ {macro_start_dt.strftime('%Y-%m-%d')}๋ก ์กฐ์ ํ์ต๋๋ค.")
    macro_df = item_df[item_df["date"] >= macro_start_dt].copy()
except Exception as e:
    st.error(f"๋ ์ง ํํฐ๋ง ์ค๋ฅ: {str(e)}")
    macro_df = item_df.copy()  # fall back to the full history

# Diagnostic panel: row counts, covered period, and a sample of the data.
with st.expander("๋ฐ์ดํฐ ์ง๋จ"):
    st.write(f"- ์ ์ฒด ๋ฐ์ดํฐ ์: {len(item_df)}")
    st.write(f"- ๋ถ์ ๋ฐ์ดํฐ ์: {len(macro_df)}")
    if len(macro_df) > 0:
        st.write(f"- ๊ธฐ๊ฐ: {macro_df['date'].min().strftime('%Y-%m-%d')} ~ {macro_df['date'].max().strftime('%Y-%m-%d')}")
        st.dataframe(macro_df.head())
    else:
        st.write("๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")

if len(macro_df) < 2:
    # Too few points to fit a model: show the raw history instead.
    st.warning(f"{selected_item}์ ๋ํ ๋ฐ์ดํฐ๊ฐ ์ถฉ๋ถํ์ง ์์ต๋๋ค. ์ ์ฒด ๊ธฐ๊ฐ ๋ฐ์ดํฐ๋ฅผ ํ์ํฉ๋๋ค.")
    fig = px.line(item_df, x="date", y="price", title=f"{selected_item} ๊ณผ๊ฑฐ ๊ฐ๊ฒฉ")
    st.plotly_chart(fig, use_container_width=True)
else:
    try:
        with st.spinner("์ฅ๊ธฐ ์์ธก ๋ชจ๋ธ ์์ฑ ์ค..."):
            m_macro, fc_macro = fit_prophet(macro_df, MACRO_END)
        if m_macro is not None and fc_macro is not None:
            fig_macro = px.line(fc_macro, x="ds", y="yhat", title="์ฅ๊ธฐ ์์ธก (1996โ2030)")
            fig_macro.add_scatter(x=macro_df["date"], y=macro_df["price"], mode="lines", name="์ค์ ๊ฐ๊ฒฉ")
            st.plotly_chart(fig_macro, use_container_width=True)

            latest_price = macro_df.iloc[-1]["price"]
            # BUGFIX: find the forecast row nearest the target date with the
            # label-based idxmin() instead of feeding POSITIONAL argsort()
            # output into .loc[], which only worked by accident on Prophet's
            # default RangeIndex.
            target_date = pd.Timestamp(MACRO_END)
            nearest_idx = (fc_macro["ds"] - target_date).abs().idxmin()
            macro_pred = fc_macro.loc[nearest_idx, "yhat"]
            macro_pct = (macro_pred - latest_price) / latest_price * 100
            st.metric("2030 ์์ธก๊ฐ", f"{macro_pred:,.0f}", f"{macro_pct:+.1f}%")
        else:
            st.warning("์์ธก ๋ชจ๋ธ์ ์์ฑํ ์ ์์ต๋๋ค.")
            fig = px.line(item_df, x="date", y="price", title=f"{selected_item} ๊ณผ๊ฑฐ ๊ฐ๊ฒฉ")
            st.plotly_chart(fig, use_container_width=True)
    except Exception as e:
        st.error(f"์ฅ๊ธฐ ์์ธก ์ค๋ฅ ๋ฐ์: {str(e)}")
        fig = px.line(item_df, x="date", y="price", title=f"{selected_item} ๊ณผ๊ฑฐ ๊ฐ๊ฒฉ")
        st.plotly_chart(fig, use_container_width=True)
# -------------------------------------------------
# MICRO FORECAST 2024-2026 ------------------------
# -------------------------------------------------
st.subheader("๐ 2024โ2026 ๋จ๊ธฐ ์์ธก")

# Restrict to the recent window; when fewer than 10 observations fall
# inside it, fall back to the latest 30% of rows (minimum 2).
try:
    micro_start_dt = pd.Timestamp(MICRO_START)
    if len(item_df[item_df["date"] >= micro_start_dt]) < 10:
        n = max(2, int(len(item_df) * 0.3))
        micro_df = item_df.sort_values("date").tail(n).copy()
        st.info(f"์ถฉ๋ถํ ์ต๊ทผ ๋ฐ์ดํฐ๊ฐ ์์ด ์ต๊ทผ {n}๊ฐ ๋ฐ์ดํฐ ํฌ์ธํธ๋ง ์ฌ์ฉํฉ๋๋ค.")
    else:
        micro_df = item_df[item_df["date"] >= micro_start_dt].copy()
except Exception as e:
    st.error(f"๋จ๊ธฐ ์์ธก ๋ฐ์ดํฐ ํํฐ๋ง ์ค๋ฅ: {str(e)}")
    micro_df = item_df.sort_values("date").tail(10).copy()  # last-resort fallback

if len(micro_df) < 2:
    st.warning(f"{MICRO_START} ์ดํ ๋ฐ์ดํฐ๊ฐ ์ถฉ๋ถํ์ง ์์ต๋๋ค.")
    fig = px.line(item_df, x="date", y="price", title=f"{selected_item} ์ต๊ทผ ๊ฐ๊ฒฉ")
    st.plotly_chart(fig, use_container_width=True)
else:
    try:
        with st.spinner("๋จ๊ธฐ ์์ธก ๋ชจ๋ธ ์์ฑ ์ค..."):
            m_micro, fc_micro = fit_prophet(micro_df, MICRO_END)
        if m_micro is not None and fc_micro is not None:
            fig_micro = px.line(fc_micro, x="ds", y="yhat", title="๋จ๊ธฐ ์์ธก (2024โ2026)")
            fig_micro.add_scatter(x=micro_df["date"], y=micro_df["price"], mode="lines", name="์ค์ ๊ฐ๊ฒฉ")
            st.plotly_chart(fig_micro, use_container_width=True)

            latest_price = micro_df.iloc[-1]["price"]
            # BUGFIX: pick the forecast row nearest the horizon via the
            # label-based idxmin() rather than mixing positional argsort()
            # indices into .loc[] (correct only on a default RangeIndex).
            target_date = pd.Timestamp(MICRO_END)
            nearest_idx = (fc_micro["ds"] - target_date).abs().idxmin()
            micro_pred = fc_micro.loc[nearest_idx, "yhat"]
            micro_pct = (micro_pred - latest_price) / latest_price * 100
            st.metric("2026 ์์ธก๊ฐ", f"{micro_pred:,.0f}", f"{micro_pct:+.1f}%")
        else:
            st.warning("๋จ๊ธฐ ์์ธก ๋ชจ๋ธ์ ์์ฑํ ์ ์์ต๋๋ค.")
    except Exception as e:
        st.error(f"๋จ๊ธฐ ์์ธก ์ค๋ฅ: {str(e)}")
# -------------------------------------------------
# SEASONALITY & PATTERN ---------------------------
# -------------------------------------------------
with st.expander("๐ ์์ฆ๋๋ฆฌํฐ & ํจํด ์ค๋ช "):
    # Only meaningful when the short-term model above was actually fitted.
    micro_ready = ('m_micro' in locals() and m_micro is not None
                   and 'fc_micro' in locals() and fc_micro is not None)
    if micro_ready:
        try:
            st.pyplot(m_micro.plot_components(fc_micro))
            # Average the yearly seasonal component per calendar month.
            seasonal_by_month = (fc_micro[["ds", "yearly"]]
                                 .assign(month=lambda d: d.ds.dt.month)
                                 .groupby("month")["yearly"].mean())
            st.markdown(
                f"**์ฐ๊ฐ ํผํฌ ์:** {int(seasonal_by_month.idxmax())}์ \n"
                f"**์ฐ๊ฐ ์ ์ ์:** {int(seasonal_by_month.idxmin())}์ \n"
                f"**์ฐ๊ฐ ๋ณ๋ํญ:** {seasonal_by_month.max() - seasonal_by_month.min():.1f}")
        except Exception as e:
            st.error(f"์์ฆ๋๋ฆฌํฐ ๋ถ์ ์ค๋ฅ: {str(e)}")
    else:
        st.info("ํจํด ๋ถ์์ ์ํ ์ถฉ๋ถํ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
# -------------------------------------------------
# CORRELATION HEATMAP -----------------------------
# -------------------------------------------------
st.subheader("๐งฎ ํ๋ชฉ ๊ฐ ์๊ด๊ด๊ณ")
try:
    # Limit to the 30 most frequent items (always keeping the selected one)
    # so the pivot and heatmap stay readable.
    items_to_corr = raw_df['item'].value_counts().head(30).index.tolist()
    if selected_item not in items_to_corr and selected_item in raw_df['item'].unique():
        items_to_corr.append(selected_item)
    filtered_df = raw_df[raw_df['item'].isin(items_to_corr)]

    # Monthly mean price per item, with items as columns.
    monthly_pivot = (filtered_df.assign(month=lambda d: d.date.dt.to_period("M"))
                     .groupby(["month", "item"], as_index=False)["price"].mean()
                     .pivot(index="month", columns="item", values="price"))

    # Drop item columns that are at least 50% missing.
    threshold = 0.5
    monthly_pivot = monthly_pivot.loc[:, monthly_pivot.isnull().mean() < threshold]

    if monthly_pivot.shape[1] > 1:  # correlation needs at least 2 items
        # BUGFIX: fillna(method='ffill'/'bfill') is deprecated and removed
        # in pandas 3.0 — use the dedicated ffill()/bfill() methods.
        monthly_pivot = monthly_pivot.ffill().bfill()

        corr = monthly_pivot.corr()

        # Heatmap of the lower triangle (upper triangle masked out).
        fig, ax = plt.subplots(figsize=(12, 10))
        mask = np.triu(np.ones_like(corr, dtype=bool))
        plt.title(f"{selected_item} ๊ด๋ จ ์๊ด๊ด๊ณ", fontsize=15)
        sns.heatmap(corr, mask=mask, annot=False, cmap="coolwarm", center=0,
                    square=True, linewidths=.5, cbar_kws={"shrink": .5})
        plt.xticks(rotation=45, ha='right', fontsize=8)
        plt.yticks(fontsize=8)

        # List the items most and least correlated with the selected one.
        if selected_item in corr.columns:
            item_corr = corr[selected_item].sort_values(ascending=False)
            top_corr = item_corr.drop(selected_item).head(5)
            bottom_corr = item_corr.drop(selected_item).tail(5)
            col1, col2 = st.columns(2)
            with col1:
                st.markdown(f"**{selected_item}์ ์๊ด๊ด๊ณ ๋์ ํ๋ชฉ**")
                for item, val in top_corr.items():
                    st.write(f"{item}: {val:.2f}")
            with col2:
                st.markdown(f"**{selected_item}์ ์๊ด๊ด๊ณ ๋ฎ์ ํ๋ชฉ**")
                for item, val in bottom_corr.items():
                    st.write(f"{item}: {val:.2f}")
        st.pyplot(fig)
    else:
        st.info("์๊ด๊ด๊ณ ๋ถ์์ ์ํ ์ถฉ๋ถํ ํ๋ชฉ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")
except Exception as e:
    st.error(f"์๊ด๊ด๊ณ ๋ถ์ ์ค๋ฅ: {str(e)}")
    st.write("์ค๋ฅ ์์ธ ์ ๋ณด:", str(e))
# -------------------------------------------------
# FOOTER ------------------------------------------
# -------------------------------------------------
# Page footer: a horizontal rule followed by the copyright caption.
st.markdown("---")
st.caption("ยฉ 2025 ํ๋ชฉ๋ณ ๊ฐ๊ฒฉ ์์ธก ์์คํ | ๋ฐ์ดํฐ ๋ถ์ ์๋ํ")