# NH-Prediction / app.py
# yokoha's picture
# Update app.py
# 0b12519 verified
# raw
# history blame
# 14.7 kB
import streamlit as st
import pandas as pd
import numpy as np
from prophet import Prophet
import plotly.express as px
import matplotlib.pyplot as plt
from datetime import date
from pathlib import Path
import matplotlib.font_manager as fm
import matplotlib as mpl
# -------------------------------------------------
# CONFIG ------------------------------------------
# -------------------------------------------------
# Forecast windows: "macro" is the long-range view, "micro" the short-range one.
MACRO_START, MACRO_END = "1996-01-01", "2030-12-31"
MICRO_START, MICRO_END = "2020-01-01", "2026-12-31"
CSV_PATH = Path("2025-domae.csv")  # input data file path

# Korean font setup: collect the installed fonts whose name contains one of
# the known family hints, then use the first match for matplotlib output.
font_list = [
    font.name
    for font in fm.fontManager.ttflist
    if any(
        hint in font.name.lower()
        for hint in ("gothic", "gulim", "malgun", "nanum", "batang")
    )
]
if not font_list:
    # No matching font installed: fall back to DejaVu Sans.
    plt.rcParams['font.family'] = 'DejaVu Sans'
else:
    font_name = font_list[0]
    plt.rcParams['font.family'] = font_name
    # Render minus signs with the ASCII hyphen instead of the Unicode minus.
    mpl.rcParams['axes.unicode_minus'] = False

st.set_page_config(page_title="ν’ˆλͺ©λ³„ 가격 예츑", page_icon="πŸ“ˆ", layout="wide")
# -------------------------------------------------
# UTILITIES ---------------------------------------
# -------------------------------------------------
# Lower-cased source column names recognised as the date / item / price columns.
DATE_CANDIDATES = {"date", "ds", "ymd", "λ‚ μ§œ", "prce_reg_mm", "etl_ldg_dt"}
ITEM_CANDIDATES = {"item", "ν’ˆλͺ©", "code", "category", "pdlt_nm", "spcs_nm"}
PRICE_CANDIDATES = {"price", "y", "value", "가격", "avrg_prce"}


def _parse_compact_dates(col: pd.Series) -> pd.Series:
    """Convert compact ``YYYYMM`` / ``YYYYMMDD`` values to month-start timestamps.

    The column is returned unchanged when it is already datetime-typed, is
    boolean, has no non-missing values, or when its first non-missing value is
    not a 6- or 8-digit number. Only the first six characters (year + month)
    are parsed, so 8-digit values are truncated to the first day of the month.
    Values that cannot be parsed become ``NaT``.
    """
    if pd.api.types.is_datetime64_any_dtype(col) or pd.api.types.is_bool_dtype(col):
        return col

    text = col.astype(str).str.strip()
    if pd.api.types.is_float_dtype(col):
        # Integer columns containing NaN are read as floats ("202001.0").
        text = text.str.replace(r"\.0$", "", regex=True)

    # Sniff the format from the first non-missing value, not blindly row 0.
    present = text[col.notna()]
    if present.empty:
        return col
    sample = present.iloc[0]
    if not (sample.isdigit() and len(sample) in (6, 8)):
        return col

    return pd.to_datetime(text.str[:6], format="%Y%m", errors="coerce")


def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with columns renamed to ``date`` / ``item`` / ``price``.

    Steps, in order:

    1. Rename columns whose (stripped, lower-cased) label is in
       ``DATE_CANDIDATES`` / ``PRICE_CANDIDATES`` / ``ITEM_CANDIDATES``. The
       first item-like column becomes ``item``; later ones become ``species``.
    2. If the rename produced duplicate labels, keep only the first of each.
    3. If there is no ``date`` column but the index is datetime-typed, move
       the index into a ``date`` column.
    4. Convert compact ``YYYYMM`` / ``YYYYMMDD`` dates to timestamps. This
       covers integer and float columns as well as strings, because
       ``pd.read_csv`` infers all-digit columns as numbers and a later plain
       ``pd.to_datetime`` would read those as epoch offsets.
    5. Merge ``item`` and ``species`` into ``"<item>-<species>"`` and drop
       ``species``.

    The caller's frame is not modified.
    """
    col_map = {}
    for c in df.columns:
        # str() so non-string labels (e.g. integer headers) do not raise.
        lc = str(c).strip().lower()
        if lc in DATE_CANDIDATES:
            col_map[c] = "date"
        elif lc in PRICE_CANDIDATES:
            col_map[c] = "price"
        elif lc in ITEM_CANDIDATES:
            # First hit is the item, any later hit is treated as the species.
            col_map[c] = "species" if "item" in col_map.values() else "item"
    df = df.rename(columns=col_map)

    # Several source columns may map to the same target; keep the first one.
    # .copy() so the assignments below never write into a slice.
    if df.columns.duplicated().any():
        df = df.loc[:, ~df.columns.duplicated()].copy()

    # A datetime index stands in for a missing date column.
    if "date" not in df.columns and df.index.dtype.kind == "M":
        df = df.reset_index()
        df = df.rename(columns={df.columns[0]: "date"})

    if "date" in df.columns:
        df["date"] = _parse_compact_dates(df["date"])

    # NOTE: the former "build item from pdlt_nm + spcs_nm" step was removed as
    # unreachable: both names are in ITEM_CANDIDATES, so they have already
    # been renamed to item/species by the loop above.
    if {"item", "species"}.issubset(df.columns):
        item_part = df["item"].astype(str).str.strip()
        species_part = df["species"].astype(str).str.strip()
        df["item"] = item_part + "-" + species_part
        df = df.drop(columns=["species"])
    return df
@st.cache_data(show_spinner=False)
def load_data() -> pd.DataFrame:
    """Read ``CSV_PATH`` and return its cleaned rows sorted by date.

    The raw columns are passed through ``_standardize_columns``; the result
    must contain ``date``, ``item`` and ``price``. Dates are coerced with
    ``pd.to_datetime`` and rows missing any of the three fields are dropped.
    Progress and diagnostics are written to the Streamlit sidebar.

    If the file is missing, a required column is absent, or any exception is
    raised, an error is shown and ``st.stop()`` is called. An empty result is
    reported with ``st.error`` but still returned.
    """
    try:
        if not CSV_PATH.exists():
            st.error(f"πŸ’Ύ {CSV_PATH} νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
            st.stop()

        st.sidebar.info(f"{CSV_PATH} νŒŒμΌμ—μ„œ 데이터λ₯Ό λΆˆλŸ¬μ˜΅λ‹ˆλ‹€.")

        # Load the CSV file directly.
        frame = pd.read_csv(CSV_PATH)
        st.sidebar.success(f"CSV 데이터 λ‘œλ“œ μ™„λ£Œ: {len(frame)}개 ν–‰")

        # Show the column names before and after standardization.
        st.sidebar.write("원본 데이터 컬럼:", list(frame.columns))
        frame = _standardize_columns(frame)
        st.sidebar.write("ν‘œμ€€ν™” ν›„ 컬럼:", list(frame.columns))

        missing = {name for name in ["date", "item", "price"] if name not in frame.columns}
        if missing:
            st.error(f"ν•„μˆ˜ 컬럼 λˆ„λ½: {', '.join(missing)} β€” 파일 컬럼λͺ…을 ν™•μΈν•˜μ„Έμš”.")
            st.stop()

        # Date conversion: report how many rows have no usable date afterwards.
        frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
        undated_rows = len(frame) - frame["date"].notna().sum()
        if undated_rows:
            st.warning(f"λ‚ μ§œ λ³€ν™˜ 쀑 {undated_rows}개 행이 μ œμ™Έλ˜μ—ˆμŠ΅λ‹ˆλ‹€.")

        # NA handling: drop rows missing any required field and report the count.
        rows_before = len(frame)
        frame = frame.dropna(subset=["date", "item", "price"])
        dropped_rows = rows_before - len(frame)
        if dropped_rows:
            st.warning(f"NA 제거 쀑 {dropped_rows}개 행이 μ œμ™Έλ˜μ—ˆμŠ΅λ‹ˆλ‹€.")

        frame = frame.sort_values("date")

        # Report the date range and item count of the cleaned data.
        if frame.empty:
            st.error("μœ νš¨ν•œ 데이터가 μ—†μŠ΅λ‹ˆλ‹€!")
        else:
            first_day = frame["date"].min().strftime("%Y-%m-%d")
            last_day = frame["date"].max().strftime("%Y-%m-%d")
            st.sidebar.write(f"데이터 λ‚ μ§œ λ²”μœ„: {first_day} ~ {last_day}")
            st.sidebar.write(f"총 ν’ˆλͺ© 수: {frame['item'].nunique()}")
        return frame
    except Exception as e:
        st.error(f"데이터 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        # Show the full traceback for debugging.
        import traceback
        st.code(traceback.format_exc())
        st.stop()
@st.cache_data(show_spinner=False)
def get_items(df: pd.DataFrame):
    """Return the distinct values of the ``item`` column as a sorted list."""
    distinct_items = df["item"].unique()
    return sorted(distinct_items)
@st.cache_data(show_spinner=False, ttl=3600)
def fit_prophet(df: pd.DataFrame, horizon_end: str):
    """Fit a Prophet model on ``date``/``price`` and forecast to *horizon_end*.

    Rows with a missing date or price are dropped, and prices sharing a date
    are averaged. The forecast frame extends daily from the last observed
    date to *horizon_end* (at least one day).

    Returns:
        ``(model, forecast)`` on success, or ``(None, None)`` when fewer than
        two data points remain or when fitting/predicting raises. In both
        failure cases a Streamlit message is shown.
    """
    usable = df.dropna(subset=["date", "price"])
    # Duplicate dates: use the mean of all prices on the same date.
    per_date = usable.groupby("date")["price"].mean().reset_index()

    n_points = len(per_date)
    if n_points < 2:
        st.warning(f"데이터 ν¬μΈνŠΈκ°€ λΆ€μ‘±ν•©λ‹ˆλ‹€. μ˜ˆμΈ‘μ„ μœ„ν•΄μ„œλŠ” μ΅œμ†Œ 2개 μ΄μƒμ˜ 유효 데이터가 ν•„μš”ν•©λ‹ˆλ‹€. (ν˜„μž¬ {n_points}개)")
        return None, None

    # Prophet expects the columns to be named ds (date) and y (value).
    prophet_df = per_date.rename(columns={"date": "ds", "price": "y"})

    try:
        m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
        m.fit(prophet_df)

        # Number of daily steps from the last observation to the horizon.
        days_ahead = (pd.Timestamp(horizon_end) - per_date["date"].max()).days
        future = m.make_future_dataframe(periods=max(days_ahead, 1), freq="D")

        forecast = m.predict(future)
    except Exception as e:
        st.error(f"Prophet λͺ¨λΈ 생성 쀑 였λ₯˜: {str(e)}")
        return None, None
    return m, forecast
# -------------------------------------------------
# LOAD DATA ---------------------------------------
# -------------------------------------------------
# Load the full dataset; stop the script when there is nothing to show.
raw_df = load_data()
if not len(raw_df):
    st.error("데이터가 λΉ„μ–΄ μžˆμŠ΅λ‹ˆλ‹€. νŒŒμΌμ„ ν™•μΈν•΄μ£Όμ„Έμš”.")
    st.stop()

# Sidebar: item picker plus today's date.
st.sidebar.header("πŸ” ν’ˆλͺ© 선택")
selected_item = st.sidebar.selectbox("ν’ˆλͺ©", get_items(raw_df))
current_date = date.today()
st.sidebar.caption(f"였늘: {current_date}")

# Keep only the rows of the chosen item; stop when none match.
item_df = raw_df[raw_df["item"] == selected_item].copy()
if item_df.empty:
    st.error("μ„ νƒν•œ ν’ˆλͺ© 데이터 μ—†μŒ")
    st.stop()
# -------------------------------------------------
# MACRO FORECAST 1996‑2030 ------------------------
# -------------------------------------------------
st.header(f"πŸ“ˆ {selected_item} 가격 예츑 λŒ€μ‹œλ³΄λ“œ")

# Pick the training window for the long-range forecast.
try:
    macro_start_dt = pd.Timestamp(MACRO_START)
    # With fewer than 10 rows on/after MACRO_START, start from the oldest row.
    if len(item_df[item_df["date"] >= macro_start_dt]) < 10:
        macro_start_dt = item_df["date"].min()
        st.info(f"μΆ©λΆ„ν•œ 데이터가 μ—†μ–΄ μ‹œμž‘ λ‚ μ§œλ₯Ό {macro_start_dt.strftime('%Y-%m-%d')}둜 μ‘°μ •ν–ˆμŠ΅λ‹ˆλ‹€.")
    macro_df = item_df[item_df["date"] >= macro_start_dt].copy()
except Exception as e:
    st.error(f"λ‚ μ§œ 필터링 였λ₯˜: {str(e)}")
    macro_df = item_df.copy()  # fall back to the unfiltered item data

# Diagnostic summary of the data that will be modelled.
with st.expander("데이터 진단"):
    st.write(f"- 전체 데이터 수: {len(item_df)}")
    st.write(f"- 뢄석 데이터 수: {len(macro_df)}")
    if len(macro_df) > 0:
        st.write(f"- κΈ°κ°„: {macro_df['date'].min().strftime('%Y-%m-%d')} ~ {macro_df['date'].max().strftime('%Y-%m-%d')}")
        st.dataframe(macro_df.head())
    else:
        st.write("데이터가 μ—†μŠ΅λ‹ˆλ‹€.")

if len(macro_df) < 2:
    # Too little data to model: show the historical prices only.
    st.warning(f"{selected_item}에 λŒ€ν•œ 데이터가 μΆ©λΆ„ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€. 전체 κΈ°κ°„ 데이터λ₯Ό ν‘œμ‹œν•©λ‹ˆλ‹€.")
    fig = px.line(item_df, x="date", y="price", title=f"{selected_item} κ³Όκ±° 가격")
    st.plotly_chart(fig, use_container_width=True)
else:
    try:
        with st.spinner("μž₯κΈ° 예츑 λͺ¨λΈ 생성 쀑..."):
            m_macro, fc_macro = fit_prophet(macro_df, MACRO_END)
        if m_macro is not None and fc_macro is not None:
            fig_macro = px.line(fc_macro, x="ds", y="yhat", title="μž₯κΈ° 예츑 (1996–2030)")
            fig_macro.add_scatter(x=macro_df["date"], y=macro_df["price"], mode="lines", name="μ‹€μ œ 가격")
            st.plotly_chart(fig_macro, use_container_width=True)

            latest_price = macro_df.iloc[-1]["price"]
            # Forecast row closest to the end of the macro horizon. idxmin
            # returns an index label, so the .loc lookup is valid for any
            # index (the former argsort result was positional).
            target_date = pd.Timestamp(MACRO_END)
            nearest_label = (fc_macro["ds"] - target_date).abs().idxmin()
            macro_pred = fc_macro.loc[nearest_label, "yhat"]
            # A percent change is undefined for a zero base price; show the
            # metric without a delta rather than inf/NaN.
            if latest_price != 0:
                macro_pct = (macro_pred - latest_price) / latest_price * 100
                macro_delta = f"{macro_pct:+.1f}%"
            else:
                macro_delta = None
            st.metric("2030 μ˜ˆμΈ‘κ°€", f"{macro_pred:,.0f}", macro_delta)
        else:
            st.warning("예츑 λͺ¨λΈμ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.")
            fig = px.line(item_df, x="date", y="price", title=f"{selected_item} κ³Όκ±° 가격")
            st.plotly_chart(fig, use_container_width=True)
    except Exception as e:
        st.error(f"μž₯κΈ° 예츑 였λ₯˜ λ°œμƒ: {str(e)}")
        fig = px.line(item_df, x="date", y="price", title=f"{selected_item} κ³Όκ±° 가격")
        st.plotly_chart(fig, use_container_width=True)
# -------------------------------------------------
# MICRO FORECAST 2024‑2026 ------------------------
# -------------------------------------------------
st.subheader("πŸ”Ž 2024–2026 단기 예츑")

# Pick the training window for the short-range forecast.
try:
    micro_start_dt = pd.Timestamp(MICRO_START)
    if len(item_df[item_df["date"] >= micro_start_dt]) < 10:
        # Fewer than 10 rows on/after MICRO_START: use the most recent 30%
        # of the item's rows instead (never fewer than 2).
        n = max(2, int(len(item_df) * 0.3))
        micro_df = item_df.sort_values("date").tail(n).copy()
        st.info(f"μΆ©λΆ„ν•œ 졜근 데이터가 μ—†μ–΄ 졜근 {n}개 데이터 포인트만 μ‚¬μš©ν•©λ‹ˆλ‹€.")
    else:
        micro_df = item_df[item_df["date"] >= micro_start_dt].copy()
except Exception as e:
    st.error(f"단기 예츑 데이터 필터링 였λ₯˜: {str(e)}")
    # Fall back to the 10 most recent data points.
    micro_df = item_df.sort_values("date").tail(10).copy()

if len(micro_df) < 2:
    # Too little data to model: show the item's prices only.
    st.warning(f"{MICRO_START} 이후 데이터가 μΆ©λΆ„ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")
    fig = px.line(item_df, x="date", y="price", title=f"{selected_item} 졜근 가격")
    st.plotly_chart(fig, use_container_width=True)
else:
    try:
        with st.spinner("단기 예츑 λͺ¨λΈ 생성 쀑..."):
            m_micro, fc_micro = fit_prophet(micro_df, MICRO_END)
        if m_micro is not None and fc_micro is not None:
            fig_micro = px.line(fc_micro, x="ds", y="yhat", title="단기 예츑 (2024–2026)")
            fig_micro.add_scatter(x=micro_df["date"], y=micro_df["price"], mode="lines", name="μ‹€μ œ 가격")
            st.plotly_chart(fig_micro, use_container_width=True)

            latest_price = micro_df.iloc[-1]["price"]
            # Forecast row closest to the end of the micro horizon. idxmin
            # returns an index label, so the .loc lookup is valid for any
            # index (the former argsort result was positional).
            target_date = pd.Timestamp(MICRO_END)
            nearest_label = (fc_micro["ds"] - target_date).abs().idxmin()
            micro_pred = fc_micro.loc[nearest_label, "yhat"]
            # A percent change is undefined for a zero base price; show the
            # metric without a delta rather than inf/NaN.
            if latest_price != 0:
                micro_pct = (micro_pred - latest_price) / latest_price * 100
                micro_delta = f"{micro_pct:+.1f}%"
            else:
                micro_delta = None
            st.metric("2026 μ˜ˆμΈ‘κ°€", f"{micro_pred:,.0f}", micro_delta)
        else:
            st.warning("단기 예츑 λͺ¨λΈμ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.")
    except Exception as e:
        st.error(f"단기 예츑 였λ₯˜: {str(e)}")
# -------------------------------------------------
# SEASONALITY & PATTERN ---------------------------
# -------------------------------------------------
with st.expander("πŸ“† μ‹œμ¦ˆλ„λ¦¬ν‹° & νŒ¨ν„΄ μ„€λͺ…"):
    # m_micro / fc_micro exist only if the short-range fit was attempted, so
    # look them up by name instead of referencing them directly.
    micro_ready = (
        locals().get("m_micro") is not None
        and locals().get("fc_micro") is not None
    )
    if not micro_ready:
        st.info("νŒ¨ν„΄ 뢄석을 μœ„ν•œ μΆ©λΆ„ν•œ 데이터가 μ—†μŠ΅λ‹ˆλ‹€.")
    else:
        try:
            comp_fig = m_micro.plot_components(fc_micro)
            st.pyplot(comp_fig)

            # Mean of the forecast's "yearly" component per calendar month.
            month_season = fc_micro["yearly"].groupby(fc_micro["ds"].dt.month).mean()
            peak_month = int(month_season.idxmax())
            trough_month = int(month_season.idxmin())
            swing = month_season.max() - month_season.min()
            st.markdown(
                f"**μ—°κ°„ 피크 μ›”:** {peak_month}μ›” \n"
                f"**μ—°κ°„ 저점 μ›”:** {trough_month}μ›” \n"
                f"**μ—°κ°„ 변동폭:** {swing:.1f}")
        except Exception as e:
            st.error(f"μ‹œμ¦ˆλ„λ¦¬ν‹° 뢄석 였λ₯˜: {str(e)}")
# -------------------------------------------------
# FOOTER ------------------------------------------
# -------------------------------------------------
st.markdown("---")
st.caption("Β© 2025 ν’ˆλͺ©λ³„ 가격 예츑 μ‹œμŠ€ν…œ | 데이터 뢄석 μžλ™ν™”")