NH-Prediction

Running

App Files Files Community

NH-Prediction / app.py

yokoha

Update app.py

277a313 verified about 1 month ago

raw

history blame

24.8 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	from prophet import Prophet
	import plotly.express as px
	import plotly.graph_objects as go
	import matplotlib.pyplot as plt
	from datetime import date
	from pathlib import Path
	import matplotlib.font_manager as fm
	import matplotlib as mpl

	# -------------------------------------------------
	# CONFIG ------------------------------------------
	# -------------------------------------------------
	# Input file and the two forecast windows (ISO date strings).
	CSV_PATH = Path("2025-domae.csv")
	MACRO_START = "1996-01-01"
	MACRO_END = "2030-12-31"
	MICRO_START = "2024-01-01"
	MICRO_END = "2026-12-31"

	# Korean font setup: collect every installed font whose name contains one
	# of these keywords and use the first match for matplotlib figures.
	_KOREAN_FONT_KEYWORDS = ("gothic", "gulim", "malgun", "nanum", "batang")
	font_list = [
	    font.name
	    for font in fm.fontManager.ttflist
	    if any(keyword in font.name.lower() for keyword in _KOREAN_FONT_KEYWORDS)
	]

	if not font_list:
	    # No matching font installed: fall back to DejaVu Sans.
	    plt.rcParams["font.family"] = "DejaVu Sans"
	else:
	    font_name = font_list[0]
	    plt.rcParams["font.family"] = font_name
	    mpl.rcParams["axes.unicode_minus"] = False

	st.set_page_config(layout="wide", page_icon="📈", page_title="품목별 가격 예측")

	# -------------------------------------------------
	# UTILITIES ---------------------------------------
	# -------------------------------------------------
	# Lower-cased header names recognised as the date, item and price columns.
	DATE_CANDIDATES = {"date", "ds", "ymd", "날짜", "prce_reg_mm", "etl_ldg_dt"}
	ITEM_CANDIDATES = {"item", "품목", "code", "category", "pdlt_nm", "spcs_nm"}
	PRICE_CANDIDATES = {"price", "y", "value", "가격", "avrg_prce"}


	def _parse_compact_dates(col: pd.Series) -> pd.Series:
	    """Parse ``YYYYMM`` / ``YYYYMMDD`` values into datetimes.

	    Integer columns are handled as well as text columns: a CSV column of
	    bare digits is read as an integer dtype, and passing such values
	    straight to ``pd.to_datetime`` would not interpret them as calendar
	    dates.

	    Args:
	        col: The candidate date column.

	    Returns:
	        A datetime series when the first non-null value is a 6- or 8-digit
	        number (6 digits map to the last day of that month); otherwise
	        ``col`` unchanged. Values that do not match the detected format
	        become ``NaT``.
	    """
	    if pd.api.types.is_integer_dtype(col):
	        text = col.astype(str)
	    elif pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col):
	        text = col.astype(str).str.strip()
	    else:
	        return col

	    # Probe the first non-null value so a leading blank cell does not
	    # disable the conversion for the whole column.
	    present = col.dropna()
	    if present.empty:
	        return col
	    sample = str(present.iloc[0]).strip()
	    if not sample.isdigit():
	        return col

	    if len(sample) == 6:  # YYYYMM -> last day of that month
	        parsed = pd.to_datetime(text, format="%Y%m", errors="coerce")
	        return parsed + pd.offsets.MonthEnd(0)
	    if len(sample) == 8:  # YYYYMMDD
	        return pd.to_datetime(text, format="%Y%m%d", errors="coerce")
	    return col


	def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
	    """Rename recognised columns to ``date`` / ``item`` / ``price``.

	    Header matching is case-insensitive and ignores surrounding whitespace.
	    The first item-like column becomes ``item``; any later one becomes
	    ``species`` and is merged into ``item`` as ``"<item>-<species>"``.
	    When several columns map to the same standard name, only the first is
	    kept. The input frame is not modified.

	    Args:
	        df: Raw frame as loaded from the data file.

	    Returns:
	        A new frame with standardised column names. Columns that match no
	        candidate set are kept under their original names.
	    """
	    col_map = {}
	    for c in df.columns:
	        lc = str(c).strip().lower()
	        if lc in DATE_CANDIDATES:
	            col_map[c] = "date"
	        elif lc in PRICE_CANDIDATES:
	            col_map[c] = "price"
	        elif lc in ITEM_CANDIDATES:
	            # First hit is the item, every later hit is the species.
	            col_map[c] = "species" if "item" in col_map.values() else "item"
	    df = df.rename(columns=col_map)

	    # Several source columns can map to one standard name; keep the first.
	    if df.columns.duplicated().any():
	        df = df.loc[:, ~df.columns.duplicated()].copy()

	    # A datetime index stands in for a missing date column.
	    if "date" not in df.columns and df.index.dtype.kind == "M":
	        df = df.reset_index()
	        df = df.rename(columns={df.columns[0]: "date"})

	    if "date" in df.columns:
	        df["date"] = _parse_compact_dates(df["date"])

	    # Merge item + species into a single display key.
	    if {"item", "species"}.issubset(df.columns):
	        item_text = df["item"].astype(str).str.strip()
	        species_text = df["species"].astype(str).str.strip()
	        df["item"] = item_text + "-" + species_text
	        df = df.drop(columns=["species"])

	    return df


	@st.cache_data(show_spinner=False)
	def load_data() -> pd.DataFrame:
	    """Read ``CSV_PATH`` and return cleaned price rows sorted by date.

	    Progress and diagnostics are written to the sidebar. The script is
	    stopped via ``st.stop()`` when the file is missing, when a required
	    column (``date``, ``item``, ``price``) is absent after standardisation,
	    or when any other exception occurs while loading.

	    Returns:
	        Rows with non-null ``date``, ``item`` and ``price``; may be empty.
	    """
	    try:
	        if not CSV_PATH.exists():
	            st.error(f"💾 {CSV_PATH} 파일을 찾을 수 없습니다.")
	            st.stop()

	        st.sidebar.info(f"{CSV_PATH} 파일에서 데이터를 불러옵니다.")

	        # Load the CSV and report its raw shape.
	        frame = pd.read_csv(CSV_PATH)
	        st.sidebar.success(f"CSV 데이터 로드 완료: {len(frame)}개 행")
	        st.sidebar.write("원본 데이터 컬럼:", list(frame.columns))

	        frame = _standardize_columns(frame)
	        st.sidebar.write("표준화 후 컬럼:", list(frame.columns))

	        absent = {name for name in ["date", "item", "price"] if name not in frame.columns}
	        if absent:
	            st.error(f"필수 컬럼 누락: {', '.join(absent)} — 파일 컬럼명을 확인하세요.")
	            st.stop()

	        # Date conversion: report how many rows lost their date.
	        total_rows = len(frame)
	        frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
	        dated_rows = int(frame["date"].notna().sum())
	        if total_rows != dated_rows:
	            st.warning(f"날짜 변환 중 {total_rows - dated_rows}개 행이 제외되었습니다.")

	        # Non-numeric prices become NaN and are dropped below.
	        frame["price"] = pd.to_numeric(frame["price"], errors="coerce")

	        # Drop rows missing any required field and report the count.
	        rows_before_drop = len(frame)
	        frame = frame.dropna(subset=["date", "item", "price"])
	        dropped_rows = rows_before_drop - len(frame)
	        if dropped_rows != 0:
	            st.warning(f"NA 제거 중 {dropped_rows}개 행이 제외되었습니다.")

	        frame = frame.sort_values("date")

	        # Summarise the usable data in the sidebar.
	        if len(frame) == 0:
	            st.error("유효한 데이터가 없습니다!")
	        else:
	            first_day = frame["date"].min().strftime("%Y-%m-%d")
	            last_day = frame["date"].max().strftime("%Y-%m-%d")
	            st.sidebar.write(f"데이터 날짜 범위: {first_day} ~ {last_day}")
	            st.sidebar.write(f"총 품목 수: {frame['item'].nunique()}")

	        return frame
	    except Exception as exc:
	        st.error(f"데이터 로드 중 오류 발생: {exc}")
	        # Show the full traceback to help diagnose the failure.
	        import traceback
	        st.code(traceback.format_exc())
	        st.stop()


	@st.cache_data(show_spinner=False)
	def get_items(df: pd.DataFrame) -> list:
	    """Return the distinct values of the ``item`` column in ascending order."""
	    distinct_items = df["item"].unique()
	    return sorted(distinct_items)


	@st.cache_data(show_spinner=False, ttl=3600)
	def fit_prophet(df: pd.DataFrame, horizon_end: str, monthly=False, changepoint_prior_scale=0.05):
	    """Fit a Prophet model on one item's prices and forecast to ``horizon_end``.

	    Args:
	        df: Training data with a datetime ``date`` column and a numeric
	            ``price`` column.
	        horizon_end: Last date to forecast, in any form ``pd.Timestamp``
	            accepts.
	        monthly: When true, prices are averaged per calendar month and the
	            forecast is produced on a month-start grid; otherwise prices
	            are averaged per distinct date and the forecast is daily.
	        changepoint_prior_scale: Passed through to Prophet; the callers use
	            lower values to reduce overfitting.

	    Returns:
	        ``(model, forecast)`` on success. ``(None, None)`` when fewer than
	        two aggregated points remain or when Prophet raises; in both cases
	        a message is shown in the UI.
	    """
	    df = df.copy()
	    df = df.dropna(subset=["date", "price"])

	    # Outlier removal: drop prices above the 99th percentile.
	    upper_limit = df["price"].quantile(0.99)
	    df = df[df["price"] <= upper_limit]

	    if monthly:
	        # Anchor every observation to the first day of its month so the
	        # training timestamps sit on the same month-start grid the forecast
	        # uses. Training on e.g. month-end dates and predicting on month
	        # starts would shift each forecast by almost a full month.
	        df = df.assign(date=df["date"].dt.to_period("M").dt.to_timestamp())
	    # One row per timestamp (per month when monthly, per date otherwise).
	    df = df.groupby("date")["price"].mean().reset_index()

	    if len(df) < 2:
	        st.warning(f"데이터 포인트가 부족합니다. 예측을 위해서는 최소 2개 이상의 유효 데이터가 필요합니다. (현재 {len(df)}개)")
	        return None, None

	    prophet_df = df.rename(columns={"date": "ds", "price": "y"})
	    horizon = pd.Timestamp(horizon_end)
	    last_observed = prophet_df["ds"].max()

	    try:
	        m = Prophet(
	            yearly_seasonality=True,
	            weekly_seasonality=False,
	            daily_seasonality=False,
	            changepoint_prior_scale=changepoint_prior_scale,
	            seasonality_prior_scale=10.0,
	            seasonality_mode='multiplicative',
	            # The charts label the band as a 95% interval; Prophet's
	            # default interval width is 80%.
	            interval_width=0.95,
	        )

	        # Korean public holidays. The country is identified by its ISO
	        # code; a spelled-out name containing a space is not accepted.
	        m.add_country_holidays(country_name='KR')

	        m.fit(prophet_df)

	        if monthly:
	            # Every month from the first observed month through the horizon
	            # month (or the last observed month if that is later), with no
	            # gaps and nothing past the horizon.
	            horizon_month = horizon.to_period("M").to_timestamp()
	            month_grid = pd.date_range(
	                start=prophet_df["ds"].min(),
	                end=max(last_observed, horizon_month),
	                freq="MS",
	            )
	            future = pd.DataFrame({"ds": month_grid})
	        else:
	            periods = max((horizon - last_observed).days, 1)
	            future = m.make_future_dataframe(periods=periods, freq="D")

	        forecast = m.predict(future)

	        # Keep predictions in a plausible price range: never negative and at
	        # most five times the highest training price. The interval bounds
	        # are limited too so they stay consistent with the point forecast.
	        price_cap = prophet_df['y'].max() * 5
	        forecast['yhat'] = forecast['yhat'].clip(lower=0).clip(upper=price_cap)
	        forecast['yhat_lower'] = forecast['yhat_lower'].clip(lower=0).clip(upper=price_cap)
	        forecast['yhat_upper'] = forecast['yhat_upper'].clip(lower=0)

	        return m, forecast
	    except Exception as e:
	        st.error(f"Prophet 모델 생성 중 오류: {str(e)}")
	        return None, None


	def format_currency(value):
	"""원화 형식으로 숫자 포맷팅"""
	return f"{value:,.0f}원"


	# -------------------------------------------------
	# LOAD DATA ---------------------------------------
	# -------------------------------------------------
	# Load the full dataset once; nothing below can run without rows.
	raw_df = load_data()

	if raw_df.shape[0] == 0:
	    st.error("데이터가 비어 있습니다. 파일을 확인해주세요.")
	    st.stop()

	# Sidebar: item picker plus today's date for reference.
	st.sidebar.header("🔍 품목 선택")
	item_options = get_items(raw_df)
	selected_item = st.sidebar.selectbox("품목", item_options)
	current_date = date.today()
	st.sidebar.caption(f"오늘: {current_date}")

	# Rows belonging to the chosen item only.
	item_df = raw_df[raw_df["item"] == selected_item].copy()
	if item_df.empty:
	    st.error("선택한 품목 데이터 없음")
	    st.stop()

	# -------------------------------------------------
	# MACRO FORECAST 1996‑2030 ------------------------
	# -------------------------------------------------
	st.header(f"📈 {selected_item} 가격 예측 대시보드")


	def _show_price_history(title: str) -> None:
	    """Fallback chart: plot the selected item's observed prices as a line."""
	    history_fig = px.line(item_df, x="date", y="price", title=title)
	    st.plotly_chart(history_fig, use_container_width=True)


	# Training window: from MACRO_START, or from the item's first observation
	# when that is later.
	try:
	    macro_start_dt = pd.Timestamp(MACRO_START)
	    if item_df["date"].min() > macro_start_dt:
	        macro_start_dt = item_df["date"].min()

	    macro_df = item_df[item_df["date"] >= macro_start_dt].copy()
	except Exception as e:
	    st.error(f"날짜 필터링 오류: {str(e)}")
	    macro_df = item_df.copy()  # fall back to the unfiltered data

	# Diagnostic summary of the rows that feed the model.
	with st.expander("데이터 진단"):
	    st.write(f"- 전체 데이터 수: {len(item_df)}")
	    st.write(f"- 분석 데이터 수: {len(macro_df)}")
	    if len(macro_df) > 0:
	        st.write(f"- 기간: {macro_df['date'].min().strftime('%Y-%m-%d')} ~ {macro_df['date'].max().strftime('%Y-%m-%d')}")
	        st.dataframe(macro_df.head())
	    else:
	        st.write("데이터가 없습니다.")

	if len(macro_df) < 2:
	    st.warning(f"{selected_item}에 대한 데이터가 충분하지 않습니다. 전체 기간 데이터를 표시합니다.")
	    _show_price_history(f"{selected_item} 과거 가격")
	else:
	    try:
	        with st.spinner("장기 예측 모델 생성 중..."):
	            # Long-range forecast on a monthly grid.
	            m_macro, fc_macro = fit_prophet(macro_df, MACRO_END, monthly=True, changepoint_prior_scale=0.01)

	        if m_macro is not None and fc_macro is not None:
	            # Observations before the cutoff are drawn as actual prices;
	            # forecast rows from the cutoff through MACRO_END as predictions.
	            cutoff_date = pd.Timestamp("2025-01-01")
	            macro_end_dt = pd.Timestamp(MACRO_END)

	            fig = go.Figure()

	            historical_data = macro_df[macro_df["date"] < cutoff_date].copy()
	            if not historical_data.empty:
	                fig.add_trace(go.Scatter(
	                    x=historical_data["date"],
	                    y=historical_data["price"],
	                    mode="lines",
	                    name="실제 가격 (1996-2024)",
	                    line=dict(color="blue", width=2)
	                ))

	            # Rows dated after MACRO_END are excluded so the chart does not
	            # run past the period named in its title.
	            in_forecast_window = (fc_macro["ds"] >= cutoff_date) & (fc_macro["ds"] <= macro_end_dt)
	            forecast_data = fc_macro[in_forecast_window].copy()
	            if not forecast_data.empty:
	                fig.add_trace(go.Scatter(
	                    x=forecast_data["ds"],
	                    y=forecast_data["yhat"],
	                    mode="lines",
	                    name="예측 가격 (2025-2030)",
	                    line=dict(color="red", width=2, dash="dash")
	                ))

	                # Uncertainty band: invisible upper line, then the lower
	                # line filled up to it.
	                fig.add_trace(go.Scatter(
	                    x=forecast_data["ds"],
	                    y=forecast_data["yhat_upper"],
	                    mode="lines",
	                    line=dict(width=0),
	                    showlegend=False
	                ))
	                fig.add_trace(go.Scatter(
	                    x=forecast_data["ds"],
	                    y=forecast_data["yhat_lower"],
	                    mode="lines",
	                    line=dict(width=0),
	                    fill="tonexty",
	                    fillcolor="rgba(255, 0, 0, 0.1)",
	                    name="95% 신뢰 구간"
	                ))

	            fig.update_layout(
	                title=f"{selected_item} 장기 가격 예측 (1996-2030)",
	                xaxis_title="연도",
	                yaxis_title="가격 (원)",
	                legend=dict(
	                    orientation="h",
	                    yanchor="bottom",
	                    y=1.02,
	                    xanchor="right",
	                    x=1
	                )
	            )

	            st.plotly_chart(fig, use_container_width=True)

	            # Headline metric: latest observed price vs. the final forecast
	            # inside the window.
	            try:
	                latest_price = macro_df.iloc[-1]["price"]
	                # Take the latest forecast row dated on or before MACRO_END.
	                # A nearest-date search around Dec 31 would prefer a row
	                # dated Jan 1 of the following year over the December row.
	                within_horizon = fc_macro[fc_macro["ds"] <= macro_end_dt]
	                final_label = within_horizon["ds"].idxmax()
	                macro_pred = within_horizon.loc[final_label, "yhat"]
	                macro_pct = (macro_pred - latest_price) / latest_price * 100

	                col1, col2 = st.columns(2)
	                with col1:
	                    st.metric("현재 가격", format_currency(latest_price))
	                with col2:
	                    st.metric("2030년 예측가", format_currency(macro_pred), f"{macro_pct:+.1f}%")
	            except Exception as e:
	                st.error(f"예측가 계산 오류: {str(e)}")
	        else:
	            st.warning("예측 모델을 생성할 수 없습니다.")
	            _show_price_history(f"{selected_item} 과거 가격")
	    except Exception as e:
	        st.error(f"장기 예측 오류 발생: {str(e)}")
	        _show_price_history(f"{selected_item} 과거 가격")

	# -------------------------------------------------
	# MICRO FORECAST 2024‑2026 ------------------------
	# -------------------------------------------------
	st.subheader("🔎 2024–2026 단기 예측 (월별)")

	# Training window: from 2021-01-01, or from the item's first observation
	# when that is later.
	try:
	    three_years_ago = pd.Timestamp("2021-01-01")
	    if item_df["date"].min() > three_years_ago:
	        three_years_ago = item_df["date"].min()

	    micro_df = item_df[item_df["date"] >= three_years_ago].copy()
	except Exception as e:
	    st.error(f"단기 예측 데이터 필터링 오류: {str(e)}")
	    # Fall back to the 24 most recent rows.
	    micro_df = item_df.sort_values("date").tail(24).copy()

	if len(micro_df) < 2:
	    st.warning("최근 데이터가 충분하지 않습니다.")
	    fig = px.line(item_df, x="date", y="price", title=f"{selected_item} 최근 가격")
	    st.plotly_chart(fig, use_container_width=True)
	else:
	    try:
	        with st.spinner("단기 예측 모델 생성 중..."):
	            # Short-range forecast on a monthly grid.
	            m_micro, fc_micro = fit_prophet(micro_df, MICRO_END, monthly=True, changepoint_prior_scale=0.05)

	        if m_micro is not None and fc_micro is not None:
	            # Display window for both observations and forecasts.
	            start_date = pd.Timestamp(MICRO_START)
	            end_date = pd.Timestamp(MICRO_END)

	            # Monthly mean of the observed prices, dated by the first
	            # observation in each month.
	            monthly_historical = micro_df.copy()
	            monthly_historical["year_month"] = monthly_historical["date"].dt.strftime("%Y-%m")
	            monthly_historical = monthly_historical.groupby("year_month").agg({
	                "date": "first",
	                "price": "mean"
	            }).reset_index(drop=True)

	            monthly_historical = monthly_historical[
	                (monthly_historical["date"] >= start_date) &
	                (monthly_historical["date"] <= end_date)
	            ]

	            monthly_forecast = fc_micro[
	                (fc_micro["ds"] >= start_date) &
	                (fc_micro["ds"] <= end_date)
	            ].copy()

	            fig = go.Figure()

	            # Observed 2024 prices (the window already starts in 2024).
	            cutoff = pd.Timestamp("2024-12-31")
	            actual_2024 = monthly_historical[monthly_historical["date"] <= cutoff]

	            if not actual_2024.empty:
	                fig.add_trace(go.Scatter(
	                    x=actual_2024["date"],
	                    y=actual_2024["price"],
	                    mode="lines+markers",
	                    name="2024 실제 가격",
	                    line=dict(color="blue", width=2),
	                    marker=dict(size=8)
	                ))

	            # Forecast rows after 2024.
	            future_data = monthly_forecast[monthly_forecast["ds"] > cutoff]

	            if not future_data.empty:
	                fig.add_trace(go.Scatter(
	                    x=future_data["ds"],
	                    y=future_data["yhat"],
	                    mode="lines+markers",
	                    name="2025-2026 예측 가격",
	                    line=dict(color="red", width=2, dash="dash"),
	                    marker=dict(size=8)
	                ))

	                # Uncertainty band: invisible upper line, then the lower
	                # line filled up to it.
	                fig.add_trace(go.Scatter(
	                    x=future_data["ds"],
	                    y=future_data["yhat_upper"],
	                    mode="lines",
	                    line=dict(width=0),
	                    showlegend=False
	                ))
	                fig.add_trace(go.Scatter(
	                    x=future_data["ds"],
	                    y=future_data["yhat_lower"],
	                    mode="lines",
	                    line=dict(width=0),
	                    fill="tonexty",
	                    fillcolor="rgba(255, 0, 0, 0.1)",
	                    name="95% 신뢰 구간"
	                ))

	            fig.update_layout(
	                title=f"{selected_item} 월별 단기 예측 (2024-2026)",
	                xaxis_title="월",
	                yaxis_title="가격 (원)",
	                xaxis=dict(
	                    tickformat="%Y-%m",
	                    dtick="M3",  # one tick every three months
	                    tickangle=45
	                ),
	                legend=dict(
	                    orientation="h",
	                    yanchor="bottom",
	                    y=1.02,
	                    xanchor="right",
	                    x=1
	                )
	            )

	            st.plotly_chart(fig, use_container_width=True)

	            # Table of the monthly forecasts after 2024.
	            with st.expander("월별 예측 가격 상세보기"):
	                monthly_detail = monthly_forecast[monthly_forecast["ds"] > cutoff].copy()
	                monthly_detail["날짜"] = monthly_detail["ds"].dt.strftime("%Y년 %m월")
	                monthly_detail["예측가격"] = monthly_detail["yhat"].apply(format_currency)
	                monthly_detail["하한값"] = monthly_detail["yhat_lower"].apply(format_currency)
	                monthly_detail["상한값"] = monthly_detail["yhat_upper"].apply(format_currency)

	                st.dataframe(
	                    monthly_detail[["날짜", "예측가격", "하한값", "상한값"]],
	                    hide_index=True
	                )

	            # Headline metric: latest price vs. the forecast closest to the
	            # end of the window.
	            try:
	                latest_price = monthly_historical.iloc[-1]["price"] if not monthly_historical.empty else micro_df.iloc[-1]["price"]

	                # monthly_forecast keeps fc_micro's index labels after the
	                # filtering above, so the row must be looked up by label
	                # (idxmin) and not by position.
	                target_date = end_date
	                nearest_label = (monthly_forecast["ds"] - target_date).abs().idxmin()
	                micro_pred = monthly_forecast.loc[nearest_label, "yhat"]
	                micro_pct = (micro_pred - latest_price) / latest_price * 100

	                col1, col2 = st.columns(2)
	                with col1:
	                    st.metric("현재 가격", format_currency(latest_price))
	                with col2:
	                    st.metric("2026년 12월 예측가", format_currency(micro_pred), f"{micro_pct:+.1f}%")
	            except Exception as e:
	                st.error(f"예측가 계산 오류: {str(e)}")
	        else:
	            st.warning("단기 예측 모델을 생성할 수 없습니다.")
	    except Exception as e:
	        st.error(f"단기 예측 오류: {str(e)}")

	# -------------------------------------------------
	# SEASONALITY & PATTERN ---------------------------
	# -------------------------------------------------
	with st.expander("📆 시즈널리티 & 패턴 설명"):
	    # m_micro / fc_micro exist only when the short-term section reached the
	    # model fit, so their presence is checked before use.
	    if 'm_micro' in locals() and m_micro is not None and 'fc_micro' in locals() and fc_micro is not None:
	        try:
	            comp_fig = m_micro.plot_components(fc_micro)
	            st.pyplot(comp_fig)

	            # Average yearly-seasonality component per calendar month.
	            month_season = (fc_micro[["ds", "yearly"]]
	                            .assign(month=lambda d: d.ds.dt.month)
	                            .groupby("month")["yearly"].mean())
	            # fit_prophet builds the model with multiplicative seasonality,
	            # so the component is a fraction of the trend; the spread is
	            # shown as a percentage.
	            season_spread = month_season.max() - month_season.min()
	            st.markdown(
	                f"연간 피크 월: {int(month_season.idxmax())}월 \n"
	                f"연간 저점 월: {int(month_season.idxmin())}월 \n"
	                f"연간 변동폭: {season_spread:.1%}")

	            # Bar chart of the monthly pattern. Labels are derived from the
	            # months actually present so labels and values always have the
	            # same length.
	            month_names = [f"{int(month)}월" for month in month_season.index]
	            month_values = month_season.values

	            fig = px.bar(
	                x=month_names,
	                y=month_values,
	                title=f"{selected_item} 월별 가격 변동 패턴",
	                labels={"x": "월", "y": "상대적 가격 변동"}
	            )

	            st.plotly_chart(fig, use_container_width=True)
	        except Exception as e:
	            st.error(f"시즈널리티 분석 오류: {str(e)}")
	    else:
	        st.info("패턴 분석을 위한 충분한 데이터가 없습니다.")

	# -------------------------------------------------
	# FOOTER ------------------------------------------
	# -------------------------------------------------
	# Footer: horizontal rule followed by the copyright caption. The pipe is
	# written without a backslash because "\|" is not a valid escape sequence
	# in a normal Python string literal.
	st.markdown("---")
	st.caption("© 2025 품목별 가격 예측 시스템 | 데이터 분석 자동화")