NH-Prediction

Running

App Files Files Community

yokoha committed on Apr 30

Commit

277a313

verified ·

1 Parent(s): 2a6cacd

Update app.py

Browse files

Files changed (1) hide show

app.py +302 -52

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import pandas as pd
 import numpy as np
 from prophet import Prophet
 import plotly.express as px
 import matplotlib.pyplot as plt
 from datetime import date
 from pathlib import Path
@@ -12,9 +13,9 @@ import matplotlib as mpl
 # -------------------------------------------------
 # CONFIG ------------------------------------------
 # -------------------------------------------------
-CSV_PATH = Path("2025-domae.csv")  # 파일 경로 수정
 MACRO_START, MACRO_END = "1996-01-01", "2030-12-31"
-MICRO_START, MICRO_END = "2020-01-01", "2026-12-31"
 # 한글 폰트 설정
 font_list = [f.name for f in fm.fontManager.ttflist if 'gothic' in f.name.lower() or
@@ -68,8 +69,12 @@ def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
     if "date" in df.columns and pd.api.types.is_object_dtype(df["date"]):
         if len(df) > 0:
             sample = str(df["date"].iloc[0])
-            if sample.isdigit() and len(sample) in (6, 8):
-                df["date"] = pd.to_datetime(df["date"].astype(str).str[:6], format="%Y%m", errors="coerce")
     # ── build item from pdlt_nm + spcs_nm if needed ────────────────────
     if "item" not in df.columns and {"pdlt_nm", "spcs_nm"}.issubset(df.columns):
@@ -115,6 +120,9 @@ def load_data() -> pd.DataFrame:
         if before_date_convert != after_date_convert:
             st.warning(f"날짜 변환 중 {before_date_convert - after_date_convert}개 행이 제외되었습니다.")
         # NA 데이터 처리
         before_na_drop = len(df)
         df = df.dropna(subset=["date", "item", "price"])
@@ -146,13 +154,32 @@ def get_items(df: pd.DataFrame):
 @st.cache_data(show_spinner=False, ttl=3600)
-def fit_prophet(df: pd.DataFrame, horizon_end: str):
     # Make a copy and ensure we have data
     df = df.copy()
     df = df.dropna(subset=["date", "price"])
-    # 중복 날짜 처리 - 동일 날짜에 여러 값이 있으면 평균값 사용
-    df = df.groupby("date")["price"].mean().reset_index()
     if len(df) < 2:
         st.warning(f"데이터 포인트가 부족합니다. 예측을 위해서는 최소 2개 이상의 유효 데이터가 필요합니다. (현재 {len(df)}개)")
@@ -162,21 +189,52 @@ def fit_prophet(df: pd.DataFrame, horizon_end: str):
     prophet_df = df.rename(columns={"date": "ds", "price": "y"})
     try:
-        # Fit the model
-        m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
         m.fit(prophet_df)
         # Generate future dates
-        periods = max((pd.Timestamp(horizon_end) - df["date"].max()).days, 1)
-        future = m.make_future_dataframe(periods=periods, freq="D")
         # Make predictions
         forecast = m.predict(future)
         return m, forecast
     except Exception as e:
         st.error(f"Prophet 모델 생성 중 오류: {str(e)}")
         return None, None
 # -------------------------------------------------
 # LOAD DATA ---------------------------------------
 # -------------------------------------------------
@@ -201,14 +259,12 @@ if item_df.empty:
 # -------------------------------------------------
 st.header(f"📈 {selected_item} 가격 예측 대시보드")
-# 데이터 필터링 로직 개선
 try:
-    macro_start_dt = pd.Timestamp(MACRO_START)
-    # 데이터가 충분하지 않으면 시작 날짜를 조정
-    if len(item_df[item_df["date"] >= macro_start_dt]) < 10:
-        # 가장 오래된 날짜부터 시작
         macro_start_dt = item_df["date"].min()
-        st.info(f"충분한 데이터가 없어 시작 날짜를 {macro_start_dt.strftime('%Y-%m-%d')}로 조정했습니다.")
     macro_df = item_df[item_df["date"] >= macro_start_dt].copy()
 except Exception as e:
@@ -232,20 +288,89 @@ if len(macro_df) < 2:
 else:
     try:
         with st.spinner("장기 예측 모델 생성 중..."):
-            m_macro, fc_macro = fit_prophet(macro_df, MACRO_END)
         if m_macro is not None and fc_macro is not None:
-            fig_macro = px.line(fc_macro, x="ds", y="yhat", title="장기 예측 (1996–2030)")
-            fig_macro.add_scatter(x=macro_df["date"], y=macro_df["price"], mode="lines", name="실제 가격")
-            st.plotly_chart(fig_macro, use_container_width=True)
-            latest_price = macro_df.iloc[-1]["price"]
-            # 2030년 마지막 날 찾기
-            target_date = pd.Timestamp(MACRO_END)
-            close_dates = fc_macro.loc[(fc_macro["ds"] - target_date).abs().argsort()[:1], "ds"].values[0]
-            macro_pred = fc_macro.loc[fc_macro["ds"] == close_dates, "yhat"].iloc[0]
-            macro_pct = (macro_pred - latest_price) / latest_price * 100
-            st.metric("2030 예측가", f"{macro_pred:,.0f}", f"{macro_pct:+.1f}%")
         else:
             st.warning("예측 모델을 생성할 수 없습니다.")
             fig = px.line(item_df, x="date", y="price", title=f"{selected_item} 과거 가격")
@@ -258,44 +383,156 @@ else:
 # -------------------------------------------------
 # MICRO FORECAST 2024‑2026 ------------------------
 # -------------------------------------------------
-st.subheader("🔎 2024–2026 단기 예측")
-# 데이터 필터링 로직 개선
 try:
-    micro_start_dt = pd.Timestamp(MICRO_START)
-    # 데이터가 충분하지 않으면 시작 날짜를 조정
-    if len(item_df[item_df["date"] >= micro_start_dt]) < 10:
-        # 최근 30% 데이터만 사용
-        n = max(2, int(len(item_df) * 0.3))
-        micro_df = item_df.sort_values("date").tail(n).copy()
-        st.info(f"충분한 최근 데이터가 없어 최근 {n}개 데이터 포인트만 사용합니다.")
-    else:
-        micro_df = item_df[item_df["date"] >= micro_start_dt].copy()
 except Exception as e:
     st.error(f"단기 예측 데이터 필터링 오류: {str(e)}")
-    # 최근 10개 데이터 포인트 사용
-    micro_df = item_df.sort_values("date").tail(10).copy()
 if len(micro_df) < 2:
-    st.warning(f"{MICRO_START} 이후 데이터가 충분하지 않습니다.")
     fig = px.line(item_df, x="date", y="price", title=f"{selected_item} 최근 가격")
     st.plotly_chart(fig, use_container_width=True)
 else:
     try:
         with st.spinner("단기 예측 모델 생성 중..."):
-            m_micro, fc_micro = fit_prophet(micro_df, MICRO_END)
         if m_micro is not None and fc_micro is not None:
-            fig_micro = px.line(fc_micro, x="ds", y="yhat", title="단기 예측 (2024–2026)")
-            fig_micro.add_scatter(x=micro_df["date"], y=micro_df["price"], mode="lines", name="실제 가격")
-            st.plotly_chart(fig_micro, use_container_width=True)
-            latest_price = micro_df.iloc[-1]["price"]
-            target_date = pd.Timestamp(MICRO_END)
-            close_dates = fc_micro.loc[(fc_micro["ds"] - target_date).abs().argsort()[:1], "ds"].values[0]
-            micro_pred = fc_micro.loc[fc_micro["ds"] == close_dates, "yhat"].iloc[0]
-            micro_pct = (micro_pred - latest_price) / latest_price * 100
-            st.metric("2026 예측가", f"{micro_pred:,.0f}", f"{micro_pct:+.1f}%")
         else:
             st.warning("단기 예측 모델을 생성할 수 없습니다.")
     except Exception as e:
@@ -317,6 +554,19 @@ with st.expander("📆 시즈널리티 & 패턴 설명"):
                 f"**연간 피크 월:** {int(month_season.idxmax())}월  \n"
                 f"**연간 저점 월:** {int(month_season.idxmin())}월  \n"
                 f"**연간 변동폭:** {month_season.max() - month_season.min():.1f}")
         except Exception as e:
             st.error(f"시즈널리티 분석 오류: {str(e)}")
     else:

 import numpy as np
 from prophet import Prophet
 import plotly.express as px
+import plotly.graph_objects as go
 import matplotlib.pyplot as plt
 from datetime import date
 from pathlib import Path
 # -------------------------------------------------
 # CONFIG ------------------------------------------
 # -------------------------------------------------
+CSV_PATH = Path("2025-domae.csv")
 MACRO_START, MACRO_END = "1996-01-01", "2030-12-31"
+MICRO_START, MICRO_END = "2024-01-01", "2026-12-31"
 # 한글 폰트 설정
 font_list = [f.name for f in fm.fontManager.ttflist if 'gothic' in f.name.lower() or
     if "date" in df.columns and pd.api.types.is_object_dtype(df["date"]):
         if len(df) > 0:
             sample = str(df["date"].iloc[0])
+            if sample.isdigit() and len(sample) == 6:  # YYYYMM 형식 확인
+                # 월 말일로 변환 (YYYYMM -> YYYY-MM-DD)
+                df["date"] = pd.to_datetime(df["date"].astype(str), format="%Y%m", errors="coerce")
+                df["date"] = df["date"] + pd.offsets.MonthEnd(0)  # 해당 월의 마지막 날로 설정
+            elif sample.isdigit() and len(sample) == 8:  # YYYYMMDD 형식
+                df["date"] = pd.to_datetime(df["date"].astype(str), format="%Y%m%d", errors="coerce")
     # ── build item from pdlt_nm + spcs_nm if needed ────────────────────
     if "item" not in df.columns and {"pdlt_nm", "spcs_nm"}.issubset(df.columns):
         if before_date_convert != after_date_convert:
             st.warning(f"날짜 변환 중 {before_date_convert - after_date_convert}개 행이 제외되었습니다.")
+        # 가격 데이터 정수형으로 변환 (숫자가 아닌 값 제거)
+        df["price"] = pd.to_numeric(df["price"], errors="coerce")
         # NA 데이터 처리
         before_na_drop = len(df)
         df = df.dropna(subset=["date", "item", "price"])
 @st.cache_data(show_spinner=False, ttl=3600)
+def fit_prophet(df: pd.DataFrame, horizon_end: str, monthly=False, changepoint_prior_scale=0.05):
+    """
+    Fit a Prophet model on price history and forecast up to ``horizon_end``.
+
+    Args:
+        df: Training data; must contain ``date`` and ``price`` columns.
+        horizon_end: Last date the forecast should reach.
+        monthly: If True, aggregate to one point per calendar month and
+            forecast on a monthly grid; otherwise aggregate per date and
+            forecast daily.
+        changepoint_prior_scale: Passed to Prophet; lower values make the
+            trend less flexible (less overfitting).
+
+    Returns:
+        ``(m, forecast)`` on success; ``(None, None)`` when fitting or
+        predicting raises.
+    """
     # Make a copy and ensure we have data
     df = df.copy()
     df = df.dropna(subset=["date", "price"])
+    # Drop outliers: exclude prices above the 99th percentile.
+    # NOTE(review): on small samples this usually removes the single highest price.
+    upper_limit = df["price"].quantile(0.99)
+    df = df[df["price"] <= upper_limit]
+    # Collapse duplicate dates
+    if monthly:
+        # Aggregate per calendar month: first date seen in the month, mean price
+        df["year_month"] = df["date"].dt.strftime('%Y-%m')
+        df = df.groupby("year_month").agg({"date": "first", "price": "mean"}).reset_index(drop=True)
+    else:
+        # Aggregate per date: mean price
+        df = df.groupby("date")["price"].mean().reset_index()
     if len(df) < 2:
         st.warning(f"데이터 포인트가 부족합니다. 예측을 위해서는 최소 2개 이상의 유효 데이터가 필요합니다. (현재 {len(df)}개)")
     prophet_df = df.rename(columns={"date": "ds", "price": "y"})
     try:
+        # Fit the model with tuned parameters
+        m = Prophet(
+            yearly_seasonality=True,
+            weekly_seasonality=False,
+            daily_seasonality=False,
+            changepoint_prior_scale=changepoint_prior_scale,  # 과적합 방지
+            seasonality_prior_scale=10.0,  # 계절성 조정
+            seasonality_mode='multiplicative'  # 곱셈 모드 (가격 데이터에 적합)
+        )
+        # 한국 명절 효과 추가 (설날, 추석)
+        m.add_country_holidays(country_name='South Korea')
         m.fit(prophet_df)
         # Generate future dates
+        if monthly:
+            # 월 단위 예측
+            future_periods = (pd.Timestamp(horizon_end).year - df["date"].max().year) * 12 + \
+                             (pd.Timestamp(horizon_end).month - df["date"].max().month) + 1
+            future = m.make_future_dataframe(periods=future_periods, freq='MS')  # 월 시작일
+            future = future.resample('MS', on='ds').first().reset_index()  # 중복 제거
+        else:
+            # 일 단위 예측
+            periods = max((pd.Timestamp(horizon_end) - df["date"].max()).days, 1)
+            future = m.make_future_dataframe(periods=periods, freq="D")
         # Make predictions
         forecast = m.predict(future)
+        # 예측값 범위 조정 (음수 예측 방지 및 상한값 설정)
+        forecast['yhat'] = np.maximum(forecast['yhat'], 0)  # 음수 제거
+        max_historical = prophet_df['y'].max() * 5  # 최대 역사적 가격의 5배로 제한
+        forecast['yhat'] = np.minimum(forecast['yhat'], max_historical)  # 상한값 설정
         return m, forecast
     except Exception as e:
         st.error(f"Prophet 모델 생성 중 오류: {str(e)}")
         return None, None
+def format_currency(value):
+    """원화 형식으로 숫자 포맷팅"""
+    return f"{value:,.0f}원"
 # -------------------------------------------------
 # LOAD DATA ---------------------------------------
 # -------------------------------------------------
 # -------------------------------------------------
 st.header(f"📈 {selected_item} 가격 예측 대시보드")
+# 데이터 필터링 로직
 try:
+    macro_start_dt = pd.Timestamp("1996-01-01")
+    # 데이터의 시작일이 1996년 이후인지 확인
+    if item_df["date"].min() > macro_start_dt:
         macro_start_dt = item_df["date"].min()
     macro_df = item_df[item_df["date"] >= macro_start_dt].copy()
 except Exception as e:
 else:
     try:
         with st.spinner("장기 예측 모델 생성 중..."):
+            # 월 단위 예측으로 변경
+            m_macro, fc_macro = fit_prophet(macro_df, MACRO_END, monthly=True, changepoint_prior_scale=0.01)
         if m_macro is not None and fc_macro is not None:
+            # 실제 데이터와 예측 데이터 구분
+            cutoff_date = pd.Timestamp("2025-01-01")
+            # 플롯 생성
+            fig = go.Figure()
+            # 실제 데이터 추가 (1996-2024)
+            historical_data = macro_df[macro_df["date"] < cutoff_date].copy()
+            if not historical_data.empty:
+                fig.add_trace(go.Scatter(
+                    x=historical_data["date"],
+                    y=historical_data["price"],
+                    mode="lines",
+                    name="실제 가격 (1996-2024)",
+                    line=dict(color="blue", width=2)
+                ))
+            # 예측 데이터 추가 (2025-2030)
+            forecast_data = fc_macro[fc_macro["ds"] >= cutoff_date].copy()
+            if not forecast_data.empty:
+                fig.add_trace(go.Scatter(
+                    x=forecast_data["ds"],
+                    y=forecast_data["yhat"],
+                    mode="lines",
+                    name="예측 가격 (2025-2030)",
+                    line=dict(color="red", width=2, dash="dash")
+                ))
+                # 신뢰 구간 추가
+                fig.add_trace(go.Scatter(
+                    x=forecast_data["ds"],
+                    y=forecast_data["yhat_upper"],
+                    mode="lines",
+                    line=dict(width=0),
+                    showlegend=False
+                ))
+                fig.add_trace(go.Scatter(
+                    x=forecast_data["ds"],
+                    y=forecast_data["yhat_lower"],
+                    mode="lines",
+                    line=dict(width=0),
+                    fill="tonexty",
+                    fillcolor="rgba(255, 0, 0, 0.1)",
+                    name="95% 신뢰 구간"
+                ))
+            # 레이아웃 설정
+            fig.update_layout(
+                title=f"{selected_item} 장기 가격 예측 (1996-2030)",
+                xaxis_title="연도",
+                yaxis_title="가격 (원)",
+                legend=dict(
+                    orientation="h",
+                    yanchor="bottom",
+                    y=1.02,
+                    xanchor="right",
+                    x=1
+                )
+            )
+            # 차트 표시
+            st.plotly_chart(fig, use_container_width=True)
+            # 2030년 예측가 표시
+            try:
+                latest_price = macro_df.iloc[-1]["price"]
+                # 2030년 마지막 월 찾기
+                target_date = pd.Timestamp("2030-12-31")
+                close_dates = fc_macro.loc[(fc_macro["ds"] - target_date).abs().argsort()[:1], "ds"].values[0]
+                macro_pred = fc_macro.loc[fc_macro["ds"] == close_dates, "yhat"].iloc[0]
+                macro_pct = (macro_pred - latest_price) / latest_price * 100
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.metric("현재 가격", format_currency(latest_price))
+                with col2:
+                    st.metric("2030년 예측가", format_currency(macro_pred), f"{macro_pct:+.1f}%")
+            except Exception as e:
+                st.error(f"예측가 계산 오류: {str(e)}")
         else:
             st.warning("예측 모델을 생성할 수 없습니다.")
             fig = px.line(item_df, x="date", y="price", title=f"{selected_item} 과거 가격")
 # -------------------------------------------------
 # MICRO FORECAST 2024‑2026 ------------------------
 # -------------------------------------------------
+st.subheader("🔎 2024–2026 단기 예측 (월별)")
+# 데이터 필터링 - 최근 3년 데이터 활용
 try:
+    three_years_ago = pd.Timestamp("2021-01-01")
+    if item_df["date"].min() > three_years_ago:
+        three_years_ago = item_df["date"].min()
+    micro_df = item_df[item_df["date"] >= three_years_ago].copy()
 except Exception as e:
     st.error(f"단기 예측 데이터 필터링 오류: {str(e)}")
+    # 최근 데이터 사용
+    micro_df = item_df.sort_values("date").tail(24).copy()
 if len(micro_df) < 2:
+    st.warning(f"최근 데이터가 충분하지 않습니다.")
     fig = px.line(item_df, x="date", y="price", title=f"{selected_item} 최근 가격")
     st.plotly_chart(fig, use_container_width=True)
 else:
     try:
         with st.spinner("단기 예측 모델 생성 중..."):
+            # 월 단위 예측으로 변경
+            m_micro, fc_micro = fit_prophet(micro_df, MICRO_END, monthly=True, changepoint_prior_scale=0.05)
         if m_micro is not None and fc_micro is not None:
+            # 2024-01-01부터 2026-12-31까지 필터링
+            start_date = pd.Timestamp("2024-01-01")
+            end_date = pd.Timestamp("2026-12-31")
+            # 월별 데이터 준비
+            monthly_historical = micro_df.copy()
+            monthly_historical["year_month"] = monthly_historical["date"].dt.strftime("%Y-%m")
+            monthly_historical = monthly_historical.groupby("year_month").agg({
+                "date": "first",
+                "price": "mean"
+            }).reset_index(drop=True)
+            monthly_historical = monthly_historical[
+                (monthly_historical["date"] >= start_date) &
+                (monthly_historical["date"] <= end_date)
+            ]
+            monthly_forecast = fc_micro[
+                (fc_micro["ds"] >= start_date) &
+                (fc_micro["ds"] <= end_date)
+            ].copy()
+            # 월별 차트 생성
+            fig = go.Figure()
+            # 2024년 실제 데이터
+            actual_2024 = monthly_historical[
+                (monthly_historical["date"] >= pd.Timestamp("2024-01-01")) &
+                (monthly_historical["date"] <= pd.Timestamp("2024-12-31"))
+            ]
+            if not actual_2024.empty:
+                fig.add_trace(go.Scatter(
+                    x=actual_2024["date"],
+                    y=actual_2024["price"],
+                    mode="lines+markers",
+                    name="2024 실제 가격",
+                    line=dict(color="blue", width=2),
+                    marker=dict(size=8)
+                ))
+            # 2024년 이후 예측 데이터
+            cutoff = pd.Timestamp("2024-12-31")
+            future_data = monthly_forecast[monthly_forecast["ds"] > cutoff]
+            if not future_data.empty:
+                fig.add_trace(go.Scatter(
+                    x=future_data["ds"],
+                    y=future_data["yhat"],
+                    mode="lines+markers",
+                    name="2025-2026 예측 가격",
+                    line=dict(color="red", width=2, dash="dash"),
+                    marker=dict(size=8)
+                ))
+                # 신뢰 구간 추가
+                fig.add_trace(go.Scatter(
+                    x=future_data["ds"],
+                    y=future_data["yhat_upper"],
+                    mode="lines",
+                    line=dict(width=0),
+                    showlegend=False
+                ))
+                fig.add_trace(go.Scatter(
+                    x=future_data["ds"],
+                    y=future_data["yhat_lower"],
+                    mode="lines",
+                    line=dict(width=0),
+                    fill="tonexty",
+                    fillcolor="rgba(255, 0, 0, 0.1)",
+                    name="95% 신뢰 구간"
+                ))
+            # 레이아웃 설정
+            fig.update_layout(
+                title=f"{selected_item} 월별 단기 예측 (2024-2026)",
+                xaxis_title="월",
+                yaxis_title="가격 (원)",
+                xaxis=dict(
+                    tickformat="%Y-%m",
+                    dtick="M3",  # 3개월 간격
+                    tickangle=45
+                ),
+                legend=dict(
+                    orientation="h",
+                    yanchor="bottom",
+                    y=1.02,
+                    xanchor="right",
+                    x=1
+                )
+            )
+            # 차트 표시
+            st.plotly_chart(fig, use_container_width=True)
+            # 월별 예측 가격 표시 (2025-2026)
+            with st.expander("월별 예측 가격 상세보기"):
+                monthly_detail = monthly_forecast[monthly_forecast["ds"] > cutoff].copy()
+                monthly_detail["날짜"] = monthly_detail["ds"].dt.strftime("%Y년 %m월")
+                monthly_detail["예측가격"] = monthly_detail["yhat"].apply(format_currency)
+                monthly_detail["하한값"] = monthly_detail["yhat_lower"].apply(format_currency)
+                monthly_detail["상한값"] = monthly_detail["yhat_upper"].apply(format_currency)
+                st.dataframe(
+                    monthly_detail[["날짜", "예측가격", "하한값", "상한값"]],
+                    hide_index=True
+                )
+            # 2026년 예측가 표시
+            try:
+                latest_price = monthly_historical.iloc[-1]["price"] if not monthly_historical.empty else micro_df.iloc[-1]["price"]
+                # 2026년 마지막 월 찾기
+                target_date = pd.Timestamp("2026-12-31")
+                close_dates = monthly_forecast.loc[(monthly_forecast["ds"] - target_date).abs().argsort()[:1], "ds"].values[0]
+                micro_pred = monthly_forecast.loc[monthly_forecast["ds"] == close_dates, "yhat"].iloc[0]
+                micro_pct = (micro_pred - latest_price) / latest_price * 100
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.metric("현재 가격", format_currency(latest_price))
+                with col2:
+                    st.metric("2026년 12월 예측가", format_currency(micro_pred), f"{micro_pct:+.1f}%")
+            except Exception as e:
+                st.error(f"예측가 계산 오류: {str(e)}")
         else:
             st.warning("단기 예측 모델을 생성할 수 없습니다.")
     except Exception as e:
                 f"**연간 피크 월:** {int(month_season.idxmax())}월  \n"
                 f"**연간 저점 월:** {int(month_season.idxmin())}월  \n"
                 f"**연간 변동폭:** {month_season.max() - month_season.min():.1f}")
+            # 월별 계절성 차트
+            month_names = ["1월", "2월", "3월", "4월", "5월", "6월", "7월", "8월", "9월", "10월", "11월", "12월"]
+            month_values = month_season.values
+            fig = px.bar(
+                x=month_names,
+                y=month_values,
+                title=f"{selected_item} 월별 가격 변동 패턴",
+                labels={"x": "월", "y": "상대적 가격 변동"}
+            )
+            st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
             st.error(f"시즈널리티 분석 오류: {str(e)}")
     else: