NH-Prediction

Running

App Files Community

yokoha committed on Apr 30

Commit

4a31bd8

verified ·

1 Parent(s): 828f0f0

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -39

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -13,7 +12,7 @@ from pathlib import Path
 # CONFIG ------------------------------------------
 # -------------------------------------------------
 CSV_PATH = Path("price_data.csv")
-PARQUET_PATH = Path("domae-202503.parquet")  # 1996-2025-03 일간/월간 가격
 MACRO_START, MACRO_END = "1996-01-01", "2030-12-31"
 MICRO_START, MICRO_END = "2020-01-01", "2026-12-31"
@@ -26,8 +25,9 @@ DATE_CANDIDATES = {"date", "ds", "ymd", "날짜", "prce_reg_mm", "etl_ldg_dt"}
 ITEM_CANDIDATES = {"item", "품목", "code", "category", "pdlt_nm", "spcs_nm"}
 PRICE_CANDIDATES = {"price", "y", "value", "가격", "avrg_prce"}
 def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
-    """Rename date/item/price columns to date, item, price. Create composite item if needed."""
     col_map = {}
     for c in df.columns:
         lc = c.lower()
@@ -36,32 +36,43 @@ def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
         elif lc in PRICE_CANDIDATES:
             col_map[c] = "price"
         elif lc in ITEM_CANDIDATES:
             if "item" not in col_map.values():
                 col_map[c] = "item"
             else:
                 col_map[c] = "species"
     df = df.rename(columns=col_map)
     if "date" not in df.columns and df.index.dtype.kind == "M":
         df.reset_index(inplace=True)
         df.rename(columns={df.columns[0]: "date"}, inplace=True)
-    if "date" in df.columns and df["date"].dtype == object:
         sample = str(df["date"].iloc[0])
         if sample.isdigit() and len(sample) in (6, 8):
             df["date"] = pd.to_datetime(df["date"].astype(str).str[:6], format="%Y%m", errors="coerce")
     if "item" not in df.columns and {"pdlt_nm", "spcs_nm"}.issubset(df.columns):
         df["item"] = df["pdlt_nm"].str.strip() + "-" + df["spcs_nm"].str.strip()
     if {"item", "species"}.issubset(df.columns):
         df["item"] = df["item"].astype(str).str.strip() + "-" + df["species"].astype(str).str.strip()
         df.drop(columns=["species"], inplace=True)
     return df
 @st.cache_data(show_spinner=False)
 def load_data() -> pd.DataFrame:
     if PARQUET_PATH.exists():
         df = pd.read_parquet(PARQUET_PATH)
     elif CSV_PATH.exists():
@@ -71,6 +82,7 @@ def load_data() -> pd.DataFrame:
         st.stop()
     df = _standardize_columns(df)
     missing = {c for c in ["date", "item", "price"] if c not in df.columns}
     if missing:
         st.error(f"필수 컬럼 누락: {', '.join(missing)} — 파일 컬럼명을 확인하세요.")
@@ -81,10 +93,12 @@ def load_data() -> pd.DataFrame:
     df.sort_values("date", inplace=True)
     return df
 @st.cache_data(show_spinner=False)
 def get_items(df: pd.DataFrame):
     return sorted(df["item"].unique())
 @st.cache_data(show_spinner=False)
 def fit_prophet(df: pd.DataFrame, horizon_end: str):
     m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
@@ -110,7 +124,7 @@ if item_df.empty:
     st.stop()
 # -------------------------------------------------
-# MACRO FORECAST 1996-2030 ------------------------
 # -------------------------------------------------
 st.header(f"📈 {selected_item} 가격 예측 대시보드")
 macro_df = item_df[item_df["date"] >= MACRO_START]
@@ -126,7 +140,7 @@ macro_pct = (macro_pred - latest_price) / latest_price * 100
 st.metric("2030 예측가", f"{macro_pred:,.0f}", f"{macro_pct:+.1f}%")
 # -------------------------------------------------
-# MICRO FORECAST 2024-2026 ------------------------
 # -------------------------------------------------
 st.subheader("🔎 2024–2026 단기 예측")
@@ -147,48 +161,23 @@ with st.expander("📆 시즈널리티 & 패턴 설명"):
     comp_fig = m_micro.plot_components(fc_micro)
     st.pyplot(comp_fig)
-    month_season = (
-        fc_micro[["ds", "yearly"]]
-        .assign(month=lambda d: d.ds.dt.month)
-        .groupby("month")["yearly"].mean()
-    )
     st.markdown(
         f"**연간 피크 월:** {int(month_season.idxmax())}월  \n"
         f"**연간 저점 월:** {int(month_season.idxmin())}월  \n"
-        f"**연간 변동폭:** {month_season.max() - month_season.min():.1f}"
-    )
 # -------------------------------------------------
 # CORRELATION HEATMAP -----------------------------
 # -------------------------------------------------
 st.subheader("🧮 품목 간 상관관계")
-monthly_pivot = (
-    raw_df.assign(month=lambda d: d.date.dt.to_period("M"))
-    .groupby(["month", "item"], as_index=False)["price"]
-    .mean()
-    .pivot(index="month", columns="item", values="price")
-)
 corr = monthly_pivot.corr()
 fig, ax = plt.subplots(figsize=(12, 10))
 mask = np.triu(np.ones_like(corr, dtype=bool))
-sns.heatmap(corr, mask=mask, cmap="RdBu_r", center=0, linewidths=.5, ax=ax)
-st.pyplot(fig)
-st.info("빨간 영역: 가격 동조화 / 파란 영역: 대체재 가능성")
-# -------------------------------------------------
-# VOLATILITY --------------------------------------
-# -------------------------------------------------
-st.subheader("📊 30일 이동 표준편차 (가격 변동성)")
-vol = (
-    item_df.set_index("date")["price"]
-    .rolling(30)
-    .std()
-    .dropna()
-    .reset_index()
-)
-fig_vol = px.area(vol, x="date", y="price", title="Rolling 30D Std Dev")
-st.plotly_chart(fig_vol, use_container_width=True)
-st.caption("데이터: domae-202503.parquet · Prophet 예측 · Streamlit 대시보드")

 import streamlit as st
 import pandas as pd
 import numpy as np
 # CONFIG ------------------------------------------
 # -------------------------------------------------
 CSV_PATH = Path("price_data.csv")
+PARQUET_PATH = Path("domae-202503.parquet")  # 1996‑2025‑03 일간/월간 가격
 MACRO_START, MACRO_END = "1996-01-01", "2030-12-31"
 MICRO_START, MICRO_END = "2020-01-01", "2026-12-31"
 ITEM_CANDIDATES = {"item", "품목", "code", "category", "pdlt_nm", "spcs_nm"}
 PRICE_CANDIDATES = {"price", "y", "value", "가격", "avrg_prce"}
 def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Standardize column names to date/item/price and deduplicate."""
     col_map = {}
     for c in df.columns:
         lc = c.lower()
         elif lc in PRICE_CANDIDATES:
             col_map[c] = "price"
         elif lc in ITEM_CANDIDATES:
+            # first hit as item, second as species
             if "item" not in col_map.values():
                 col_map[c] = "item"
             else:
                 col_map[c] = "species"
     df = df.rename(columns=col_map)
+    # ── handle duplicated columns after rename ─────────────────────────
+    if df.columns.duplicated().any():
+        df = df.loc[:, ~df.columns.duplicated()]
+    # ── index datetime to column ───────────────────────────────────────
     if "date" not in df.columns and df.index.dtype.kind == "M":
         df.reset_index(inplace=True)
         df.rename(columns={df.columns[0]: "date"}, inplace=True)
+    # ── convert YYYYMM string to datetime ──────────────────────────────
+    if "date" in df.columns and pd.api.types.is_object_dtype(df["date" ]):
         sample = str(df["date"].iloc[0])
         if sample.isdigit() and len(sample) in (6, 8):
             df["date"] = pd.to_datetime(df["date"].astype(str).str[:6], format="%Y%m", errors="coerce")
+    # ── build item from pdlt_nm + spcs_nm if needed ────────────────────
     if "item" not in df.columns and {"pdlt_nm", "spcs_nm"}.issubset(df.columns):
         df["item"] = df["pdlt_nm"].str.strip() + "-" + df["spcs_nm"].str.strip()
+    # ── merge item + species ───────────────────────────────────────────
     if {"item", "species"}.issubset(df.columns):
         df["item"] = df["item"].astype(str).str.strip() + "-" + df["species"].astype(str).str.strip()
         df.drop(columns=["species"], inplace=True)
     return df
 @st.cache_data(show_spinner=False)
 def load_data() -> pd.DataFrame:
+    """Load price data from Parquet if available, else CSV. Handle flexible schema."""
     if PARQUET_PATH.exists():
         df = pd.read_parquet(PARQUET_PATH)
     elif CSV_PATH.exists():
         st.stop()
     df = _standardize_columns(df)
     missing = {c for c in ["date", "item", "price"] if c not in df.columns}
     if missing:
         st.error(f"필수 컬럼 누락: {', '.join(missing)} — 파일 컬럼명을 확인하세요.")
     df.sort_values("date", inplace=True)
     return df
 @st.cache_data(show_spinner=False)
 def get_items(df: pd.DataFrame):
     return sorted(df["item"].unique())
 @st.cache_data(show_spinner=False)
 def fit_prophet(df: pd.DataFrame, horizon_end: str):
     m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
     st.stop()
 # -------------------------------------------------
+# MACRO FORECAST 1996‑2030 ------------------------
 # -------------------------------------------------
 st.header(f"📈 {selected_item} 가격 예측 대시보드")
 macro_df = item_df[item_df["date"] >= MACRO_START]
 st.metric("2030 예측가", f"{macro_pred:,.0f}", f"{macro_pct:+.1f}%")
 # -------------------------------------------------
+# MICRO FORECAST 2024‑2026 ------------------------
 # -------------------------------------------------
 st.subheader("🔎 2024–2026 단기 예측")
     comp_fig = m_micro.plot_components(fc_micro)
     st.pyplot(comp_fig)
+    month_season = (fc_micro[["ds", "yearly"]]
+                    .assign(month=lambda d: d.ds.dt.month)
+                    .groupby("month")["yearly"].mean())
     st.markdown(
         f"**연간 피크 월:** {int(month_season.idxmax())}월  \n"
         f"**연간 저점 월:** {int(month_season.idxmin())}월  \n"
+        f"**연간 변동폭:** {month_season.max() - month_season.min():.1f}")
 # -------------------------------------------------
 # CORRELATION HEATMAP -----------------------------
 # -------------------------------------------------
 st.subheader("🧮 품목 간 상관관계")
+monthly_pivot = (raw_df.assign(month=lambda d: d.date.dt.to_period("M"))
+                        .groupby(["month", "item"], as_index=False)["price"].mean()
+                        .pivot(index="month", columns="item", values="price"))
 corr = monthly_pivot.corr()
 fig, ax = plt.subplots(figsize=(12, 10))
 mask = np.triu(np.ones_like(corr, dtype=bool))
+sns.heatmap(c