NH-Prediction

Running

File size: 9,166 Bytes

1acd6e1
 
 
 
 
 
 
 
dc2be38
1acd6e1
dc2be38
 
 
 
4a31bd8
4fb476c
 
 
 
 
dc2be38
 
 
828f0f0
 
 
3268778
4a31bd8
3268778
4a31bd8
3268778
 
 
828f0f0
3268778
 
 
828f0f0
4a31bd8
828f0f0
 
 
 
 
 
4a31bd8
 
 
 
 
828f0f0
 
 
 
4a31bd8
 
828f0f0
 
 
 
4a31bd8
828f0f0
 
 
4a31bd8
828f0f0
 
 
3268778
 
 
4a31bd8
1acd6e1
dc2be38
4a31bd8
dc2be38
 
 
 
 
3268778
 
 
 
4a31bd8
3268778
 
 
dc2be38
3268778
 
828f0f0
1acd6e1
 
 
4a31bd8
1acd6e1
 
 
 
4a31bd8
4fb476c
1acd6e1
 
 
3268778
 
1acd6e1
 
 
dc2be38
 
 
 
 
 
 
1acd6e1
dc2be38
1acd6e1
dc2be38
1acd6e1
dc2be38
1acd6e1
 
dc2be38
4a31bd8
dc2be38
 
 
3268778
4fb476c
 
1acd6e1
 
 
4fb476c
dc2be38
 
 
1acd6e1
3268778
4a31bd8
3268778
dc2be38
3268778
4fb476c
 
 
1acd6e1
 
 
dc2be38
 
 
1acd6e1
3268778
 
 
dc2be38
1acd6e1
 
3268778
4a31bd8
 
 
dc2be38
3268778
 
4a31bd8
dc2be38
762f595
 
 
3268778
 
 
dc2be38
4a31bd8
 
 
dc2be38
 
 
3268778
762f595

import streamlit as st
import pandas as pd
import numpy as np
from prophet import Prophet
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
from pathlib import Path

# -------------------------------------------------
# CONFIG ------------------------------------------
# -------------------------------------------------
# Input files, resolved relative to the working directory.  load_data()
# tries the Parquet file first and falls back to the CSV.
PARQUET_PATH = Path("domae-202503.parquet")  # daily/monthly prices, 1996 through 2025-03
CSV_PATH = Path("price_data.csv")

# Forecast windows as ISO date strings: history is taken from *_START
# onwards and the forecast is extended up to *_END.
MACRO_START = "1996-01-01"
MACRO_END = "2030-12-31"
MICRO_START = "2020-01-01"
MICRO_END = "2026-12-31"

# Browser-tab title and icon, plus the full-width page layout.
st.set_page_config(
    page_title="품목별 가격 예측",
    page_icon="📈",
    layout="wide",
)

# -------------------------------------------------
# UTILITIES ---------------------------------------
# -------------------------------------------------
# Lower-cased column names recognised as the date, item and price fields.
DATE_CANDIDATES = {"date", "ds", "ymd", "날짜", "prce_reg_mm", "etl_ldg_dt"}
ITEM_CANDIDATES = {"item", "품목", "code", "category", "pdlt_nm", "spcs_nm"}
PRICE_CANDIDATES = {"price", "y", "value", "가격", "avrg_prce"}


def _compact_dates_to_datetime(dates: pd.Series) -> pd.Series:
    """Parse digit-only ``YYYYMM`` / ``YYYYMMDD`` values into datetimes.

    The column is converted only when it is textual or integer typed and its
    first non-null value consists of exactly 6 or 8 digits.  Only the first
    six characters (year and month) are parsed, so 8-digit values collapse to
    the first day of their month.  Values that cannot be parsed become NaT.
    Any other column is returned unchanged.
    """
    is_textual = (
        pd.api.types.is_object_dtype(dates)
        or pd.api.types.is_string_dtype(dates)
    )
    # Integer columns matter too: a CSV column of 202503-style values is read
    # as int64 and would otherwise be interpreted as epoch nanoseconds later.
    if not (is_textual or pd.api.types.is_integer_dtype(dates)):
        return dates

    present = dates.dropna()
    if present.empty:
        # Empty or all-null column: nothing to sample, nothing to convert.
        return dates

    sample = str(present.iloc[0])
    if not (sample.isdigit() and len(sample) in (6, 8)):
        return dates

    return pd.to_datetime(
        dates.astype(str).str[:6], format="%Y%m", errors="coerce"
    )


def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with columns normalised to date/item/price.

    Steps, in order:

    1. Rename columns whose lower-cased (and whitespace-stripped) label is in
       ``DATE_CANDIDATES`` / ``PRICE_CANDIDATES`` / ``ITEM_CANDIDATES``.  The
       first item-like column becomes ``item``; later ones become ``species``.
    2. If the rename produced duplicate labels, keep the first of each.
    3. If there is no ``date`` column but the index is a ``DatetimeIndex``,
       move the index into a ``date`` column.
    4. Convert compact ``YYYYMM`` / ``YYYYMMDD`` dates (string or integer) to
       month-start datetimes.
    5. If both ``item`` and ``species`` exist, join them as
       ``"<item>-<species>"`` (each part stripped) and drop ``species``.

    Args:
        df: Raw price table with an arbitrary but recognisable schema.

    Returns:
        A new DataFrame; the input frame is not modified.  Columns that match
        no candidate set are passed through unchanged.
    """
    col_map = {}
    item_assigned = False
    for col in df.columns:
        # str() so that non-string labels (e.g. integers) do not crash here.
        key = str(col).strip().lower()
        if key in DATE_CANDIDATES:
            col_map[col] = "date"
        elif key in PRICE_CANDIDATES:
            col_map[col] = "price"
        elif key in ITEM_CANDIDATES:
            # First hit is the item, every later hit is treated as species.
            col_map[col] = "species" if item_assigned else "item"
            item_assigned = True
    out = df.rename(columns=col_map)

    # Several source columns may map to the same target; keep the first one.
    # .copy() makes the result an independent frame that is safe to assign to.
    if out.columns.duplicated().any():
        out = out.loc[:, ~out.columns.duplicated()].copy()

    # A datetime index stands in for a missing date column.
    if "date" not in out.columns and isinstance(out.index, pd.DatetimeIndex):
        out = out.reset_index()
        out = out.rename(columns={out.columns[0]: "date"})

    if "date" in out.columns:
        out["date"] = _compact_dates_to_datetime(out["date"])

    # Merge item + species into a single display key.
    if {"item", "species"}.issubset(out.columns):
        out["item"] = (
            out["item"].astype(str).str.strip()
            + "-"
            + out["species"].astype(str).str.strip()
        )
        out = out.drop(columns=["species"])

    return out


@st.cache_data(show_spinner=False)
def load_data() -> pd.DataFrame:
    """Load the price table and normalise it to date/item/price columns.

    Reads ``PARQUET_PATH`` when it exists, otherwise ``CSV_PATH``.  The frame
    is passed through ``_standardize_columns``; dates and prices are then
    coerced to datetime / numeric, rows lacking a date, item or price are
    dropped, and the result is sorted by date.

    If neither file exists, or a required column is still missing after
    standardisation, an error is shown in the app and the script run is
    halted via ``st.stop()``.

    Returns:
        The cleaned price table, sorted ascending by ``date``.
    """
    if PARQUET_PATH.exists():
        df = pd.read_parquet(PARQUET_PATH)
    elif CSV_PATH.exists():
        df = pd.read_csv(CSV_PATH)
    else:
        st.error("💾 price_data.csv 또는 domae-202503.parquet 파일을 찾을 수 없습니다.")
        st.stop()

    df = _standardize_columns(df)

    # A list (not a set) keeps the column names in a stable order in the message.
    missing = [c for c in ("date", "item", "price") if c not in df.columns]
    if missing:
        st.error(f"필수 컬럼 누락: {', '.join(missing)} — 파일 컬럼명을 확인하세요.")
        st.stop()

    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    # Coerce prices the same way as dates: unparseable values become NaN and
    # are dropped below instead of breaking the numeric aggregations later.
    df["price"] = pd.to_numeric(df["price"], errors="coerce")
    df = df.dropna(subset=["date", "item", "price"])
    # Stable sort so rows sharing a date keep their file order.
    df = df.sort_values("date", kind="mergesort")
    return df


@st.cache_data(show_spinner=False)
def get_items(df: pd.DataFrame):
    """Return the distinct values of the ``item`` column in ascending order."""
    names = list(df["item"].unique())
    names.sort()
    return names


@st.cache_data(show_spinner=False)
def fit_prophet(df: pd.DataFrame, horizon_end: str):
    """Fit a yearly-seasonal Prophet model and forecast daily to *horizon_end*.

    Args:
        df: History with a datetime ``date`` column and a ``price`` column.
            Other columns are ignored.
        horizon_end: Date (anything ``pd.Timestamp`` accepts) up to which the
            forecast is extended past the last observation.

    Returns:
        ``(model, forecast)`` where ``forecast`` is the frame returned by
        ``Prophet.predict`` over the history plus the future dates.
    """
    # Hand Prophet only the two columns it needs, under the names it expects.
    history = df[["date", "price"]].rename(columns={"date": "ds", "price": "y"})

    m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
    m.fit(history)

    # The day difference is negative when the data already reaches past the
    # horizon; clamp to zero so the forecast then covers the history only
    # instead of failing on a negative period count.
    periods = max((pd.Timestamp(horizon_end) - history["ds"].max()).days, 0)
    future = m.make_future_dataframe(periods=periods, freq="D")
    forecast = m.predict(future)
    return m, forecast

# -------------------------------------------------
# LOAD DATA ---------------------------------------
# -------------------------------------------------
# Full price table (load_data is wrapped in st.cache_data).
raw_df = load_data()

# Sidebar: item picker plus today's date for reference.
st.sidebar.header("🔍 품목 선택")
item_choices = get_items(raw_df)
selected_item = st.sidebar.selectbox("품목", item_choices)
current_date = date.today()
st.sidebar.caption(f"오늘: {current_date}")

# Keep only the rows of the chosen item; halt the run when there are none.
is_selected = raw_df["item"] == selected_item
item_df = raw_df[is_selected].copy()
if item_df.empty:
    st.error("선택한 품목 데이터 없음")
    st.stop()

# -------------------------------------------------
# MACRO FORECAST 1996‑2030 ------------------------
# -------------------------------------------------
st.header(f"📈 {selected_item} 가격 예측 대시보드")
macro_df = item_df[item_df["date"] >= MACRO_START]

# Prophet cannot be fitted on fewer than two observations; stop with a
# readable message instead of letting the fit raise.
if len(macro_df) < 2:
    st.error("예측에 필요한 데이터가 부족합니다 (최소 2개 시점 필요).")
    st.stop()

m_macro, fc_macro = fit_prophet(macro_df, MACRO_END)
fig_macro = px.line(fc_macro, x="ds", y="yhat", title="Macro Forecast 1996–2030")
fig_macro.add_scatter(x=macro_df["date"], y=macro_df["price"], mode="lines", name="Actual")
st.plotly_chart(fig_macro, use_container_width=True)

# Most recent observed price of the item (rows are sorted by date).
latest_price = macro_df.iloc[-1]["price"]

# Use the latest forecast row at or before the horizon.  An exact
# `ds == MACRO_END` match does not exist when the history carries a
# time-of-day component or already extends past the horizon, and indexing the
# empty selection would raise IndexError.
macro_window = fc_macro[fc_macro["ds"] <= pd.Timestamp(MACRO_END)]
if macro_window.empty:
    macro_window = fc_macro
macro_pred = macro_window.loc[macro_window["ds"].idxmax(), "yhat"]
macro_pct = (macro_pred - latest_price) / latest_price * 100
st.metric("2030 예측가", f"{macro_pred:,.0f}", f"{macro_pct:+.1f}%")

# -------------------------------------------------
# MICRO FORECAST 2024‑2026 ------------------------
# -------------------------------------------------
st.subheader("🔎 2024–2026 단기 예측")

micro_df = item_df[item_df["date"] >= MICRO_START]

# Prophet cannot be fitted on fewer than two observations.  An item with too
# little recent data skips the short-term sections instead of crashing the
# whole page.
has_micro = len(micro_df) >= 2
if has_micro:
    m_micro, fc_micro = fit_prophet(micro_df, MICRO_END)
    fig_micro = px.line(fc_micro, x="ds", y="yhat", title="Micro Forecast 2024–2026")
    fig_micro.add_scatter(x=micro_df["date"], y=micro_df["price"], mode="lines", name="Actual")
    st.plotly_chart(fig_micro, use_container_width=True)

    # Latest forecast row at or before the horizon; an exact timestamp match
    # is not guaranteed, and indexing an empty selection would raise.
    micro_window = fc_micro[fc_micro["ds"] <= pd.Timestamp(MICRO_END)]
    if micro_window.empty:
        micro_window = fc_micro
    micro_pred = micro_window.loc[micro_window["ds"].idxmax(), "yhat"]
    micro_pct = (micro_pred - latest_price) / latest_price * 100
    st.metric("2026 예측가", f"{micro_pred:,.0f}", f"{micro_pct:+.1f}%")
else:
    m_micro = fc_micro = None
    st.warning(f"{MICRO_START} 이후 데이터가 부족하여 단기 예측을 표시할 수 없습니다.")

# -------------------------------------------------
# SEASONALITY & PATTERN ---------------------------
# -------------------------------------------------
if has_micro:
    with st.expander("📆 시즈널리티 & 패턴 설명"):
        comp_fig = m_micro.plot_components(fc_micro)
        st.pyplot(comp_fig)
        # Release the figure; otherwise one accumulates on every script rerun.
        plt.close(comp_fig)

        # Mean of the yearly seasonal component per calendar month.
        month_season = (fc_micro[["ds", "yearly"]]
                        .assign(month=lambda d: d.ds.dt.month)
                        .groupby("month")["yearly"].mean())
        st.markdown(
            f"**연간 피크 월:** {int(month_season.idxmax())}월  \n"
            f"**연간 저점 월:** {int(month_season.idxmin())}월  \n"
            f"**연간 변동폭:** {month_season.max() - month_season.min():.1f}")

# -------------------------------------------------
# CORRELATION HEATMAP -----------------------------
# -------------------------------------------------
st.subheader("🧮 품목 간 상관관계")

# Monthly mean price per item, pivoted to one column per item.
monthly_pivot = (
    raw_df.assign(month=lambda d: d["date"].dt.to_period("M"))
    .groupby(["month", "item"], as_index=False)["price"].mean()
    .pivot(index="month", columns="item", values="price")
)

corr = monthly_pivot.corr()
fig, ax = plt.subplots(figsize=(12, 10))
# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Draw on the axes created above rather than on pyplot's implicit current axes.
sns.heatmap(corr, mask=mask, annot=False, cmap="coolwarm", center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

# Highlight correlations with the selected item
if selected_item in corr.columns:
    # Drop the item itself and undefined (NaN) correlations; NaN sorts last
    # and would otherwise fill the "lowest correlation" list.
    item_corr = (
        corr[selected_item]
        .drop(selected_item)
        .dropna()
        .sort_values(ascending=False)
    )
    top_corr = item_corr.head(5)
    bottom_corr = item_corr.tail(5)

    col1, col2 = st.columns(2)
    with col1:
        st.markdown(f"**{selected_item}와 상관관계 높은 품목**")
        for item, val in top_corr.items():
            st.write(f"{item}: {val:.2f}")
    with col2:
        st.markdown(f"**{selected_item}와 상관관계 낮은 품목**")
        for item, val in bottom_corr.items():
            st.write(f"{item}: {val:.2f}")

st.pyplot(fig)
# Release the figure; otherwise one accumulates on every script rerun.
plt.close(fig)

# -------------------------------------------------
# FOOTER ------------------------------------------
# -------------------------------------------------
# Horizontal rule followed by the copyright caption.
footer_note = "© 2024 품목별 가격 예측 시스템 | 데이터 분석 자동화"
st.markdown("---")
st.caption(footer_note)