[email protected]
commited on
Commit
·
a88ccc4
1
Parent(s):
ba378b3
95% CI added (rebuttal)
Files changed:
- app.py +2 -0
- streamlit_app_local/app.py +2 -0
- streamlit_app_local/view_utils.py +42 -2
- varco_arena/varco_arena_core/elo.py +41 -1
- varco_arena/varco_arena_core/prompts/llmbar.py +1 -0
- varco_arena/varco_arena_core/visualization.py +5 -4
- view_utils.py +42 -2
app.py
CHANGED

@@ -205,6 +205,7 @@ def run_varco_arena(
     last_update_time = time.time()
     terminal_output = st.empty()
     full_output = f"{command}\n"
+    to_show = full_output
     while True:
         # Check if we have output to read
         if select.select([process.stdout], [], [], 0)[0]:
@@ -229,6 +230,7 @@ def run_varco_arena(
         # Check if the process has finished
         if process.poll() is not None:
             # Read any remaining output
+            os.set_blocking(process.stdout.fileno(), True)
             remaining_output = process.stdout.read()
             if remaining_output:
                 lines = remaining_output.split("\n")
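The hunks above only show fragments of the streaming loop, so here is a minimal, self-contained sketch of the pattern they modify: poll the subprocess pipe with a zero-timeout select while the process runs, then flip the descriptor back to blocking with os.set_blocking(..., True) before the final read() so the last chunk is fully drained (a non-blocking read can return early when no data is ready). This is a POSIX-only sketch with an illustrative command, not the app's actual code:

import os
import select
import subprocess

# Illustrative command only; the real app launches its own CLI.
process = subprocess.Popen(
    ["bash", "-c", "for i in 1 2 3; do echo line $i; sleep 0.2; done"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
)
# Non-blocking while the loop is live, so the UI loop never stalls on the pipe.
os.set_blocking(process.stdout.fileno(), False)

full_output = ""
while True:
    # Zero timeout: select returns immediately whether or not data is ready.
    if select.select([process.stdout], [], [], 0)[0]:
        line = process.stdout.readline()
        if line:
            full_output += line

    if process.poll() is not None:
        # Back to blocking so read() drains the pipe to EOF.
        os.set_blocking(process.stdout.fileno(), True)
        remaining_output = process.stdout.read()
        if remaining_output:
            full_output += remaining_output
        break

print(full_output, end="")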
streamlit_app_local/app.py
CHANGED

@@ -106,6 +106,7 @@ def run_varco_arena(
     last_update_time = time.time()
     terminal_output = st.empty()
    full_output = f"{command}\n"
+    to_show = full_output
     while True:
         # Check if we have output to read
         if select.select([process.stdout], [], [], 0)[0]:
@@ -130,6 +131,7 @@ def run_varco_arena(
         # Check if the process has finished
         if process.poll() is not None:
             # Read any remaining output
+            os.set_blocking(process.stdout.fileno(), True)
             remaining_output = process.stdout.read()
             if remaining_output:
                 lines = remaining_output.split("\n")
streamlit_app_local/view_utils.py
CHANGED

@@ -154,6 +154,44 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     return df
 
 
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025)
+    ci_upper = bootstrap_df.quantile(0.975)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
+
+
+
 def fill_missing_values(df, default_value=0):
     """
     This is used for completing pivot table
@@ -378,14 +416,16 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
     elo_wr = compute_relative_winrate_to_1st(elo)
     # beautify
     elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
     elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
     elo_wr.index.name = "Rank"
+    elo_wr["95% CI"] = elo_wr.apply(lambda row: f"({row['95% CI_upper']:.1f}, {row['95% CI_lower']:.1f})", axis=1)
+    elo_wr = elo_wr.rename(columns = {"95% CI": "95% CI (UB, LB)"})
 
-    figure_dict["elo_rating"] = elo_wr
+    figure_dict["elo_rating"] = elo_wr[["Model", "Elo rating", "95% CI (UB, LB)", "winrate_vs_1st"]]
 
     # Elo Rating by Task: Radar chart
     if is_overall:
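The new compute_elo_with_ci is a percentile bootstrap: resample the battle rows with replacement, refit Elo on each resample, and take the 2.5th/97.5th percentiles of each model's rating distribution. A hypothetical usage sketch, assuming compute_elo_with_ci is importable and that the battles table follows the Chatbot-Arena-style model_a / model_b / winner schema that compute_mle_elo implementations typically consume (the column names and values here are assumptions, not taken from this repo):

import pandas as pd

# Hypothetical battles table; the real schema is whatever compute_mle_elo expects.
matches = pd.DataFrame(
    {
        "model_a": ["gpt", "claude", "gpt", "llama"],
        "model_b": ["claude", "llama", "llama", "gpt"],
        "winner": ["model_a", "model_a", "model_b", "model_b"],
    }
)

elo_ci = compute_elo_with_ci(matches, n_bootstrap=1000)
# One row per model: point estimate plus the bootstrap interval bounds.
print(elo_ci[["Model", "Elo rating", "95% CI_lower", "95% CI_upper"]])

Note that every bootstrap iteration refits the full MLE, so runtime scales linearly with n_bootstrap.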
varco_arena/varco_arena_core/elo.py
CHANGED

@@ -50,6 +50,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
         .reset_index(drop=True)
     )
     df.index = df.index + 1
+    df["Elo rating"] = df["Elo rating"]
 
     return df
 
@@ -69,7 +70,7 @@ def compute_relative_winrate_to_1st(elo_df):
 
     rating1st = elo_df["Elo rating"].max()
     win_rate_to_1st = partial(elo_to_winrate, rating_b=rating1st)
-    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st)
+    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st).round(3)
     print(elo_df)
 
     return elo_df
@@ -80,3 +81,42 @@ def elo_to_winrate(rating_a: float = None, rating_b: float = None) -> float:
     rate_diff = rating_a - rating_b
     win_rate = 1 / (1 + 10 ** (-rate_diff / 400))
     return win_rate
+
+
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025).round(1)
+    ci_upper = bootstrap_df.quantile(0.975).round(1)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df["Elo rating"] = result_df["Elo rating"].round(1)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
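To make the quantile step concrete: bootstrap_df holds one row per resample and one column per model, so quantile(0.025) and quantile(0.975) collapse each column into the interval endpoints. A toy, self-contained illustration of the same percentile bootstrap applied to a simple mean (synthetic data, not the arena's):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.Series(rng.normal(loc=1000, scale=50, size=200))

# Resample with replacement and recompute the statistic each time.
boot_means = pd.Series(
    [data.sample(n=len(data), replace=True, random_state=i).mean() for i in range(1000)]
)

# The 95% percentile-bootstrap CI is just the 2.5th and 97.5th quantiles.
print(f"95% CI: ({boot_means.quantile(0.025):.1f}, {boot_means.quantile(0.975):.1f})")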
varco_arena/varco_arena_core/prompts/llmbar.py
CHANGED

@@ -155,6 +155,7 @@ class LLMBarPrompt(ComparisonPromptBase):
         out_b: str = None,
         task: Optional[str] = None,
         criteria_questions: Optional[str] = None,
+        **kwargs,
     ) -> List[Dict]:
         if (criteria_questions and task) or criteria_questions is None and task is None:
             raise ValueError(
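The **kwargs addition is the standard tolerant-signature pattern: callers can pass a superset of keyword arguments, and prompt classes that don't need some of them ignore the extras instead of raising TypeError. A stripped-down illustration (function and argument names are hypothetical, not this class's real signature):

from typing import Dict, List

def complete_prompt(inst: str, out_a: str, out_b: str, **kwargs) -> List[Dict]:
    # Extra keyword arguments (e.g. fields meant for other prompt classes)
    # are accepted here and simply ignored.
    return [{"role": "user", "content": f"{inst}\nA: {out_a}\nB: {out_b}"}]

# Works even though this prompt never uses `task`.
complete_prompt("Which answer is better?", "foo", "bar", task="summarization")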
varco_arena/varco_arena_core/visualization.py
CHANGED

@@ -2,7 +2,7 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 
-from .elo import compute_mle_elo
+from .elo import compute_elo_with_ci, compute_mle_elo
 
 
 def fill_missing_values(df, default_value=0):
@@ -137,7 +137,7 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
 
     char_width = 16
     header_char_width = 20
@@ -152,7 +152,7 @@ def visualization(results, is_overall=False):
         data=[
             go.Table(
                 header=dict(
-                    values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>"],
+                    values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>", "<b>95% CI</b>"],
                     fill_color="paleturquoise",
                     align="left",
                     font=dict(size=16),
@@ -161,7 +161,8 @@ def visualization(results, is_overall=False):
                 values=[
                     list(range(1, len(elo) + 1)),
                     elo["Model"],
-                    elo["Elo rating"],
+                    elo["Elo rating"].round(2),
+                    [f"{lower:.2f} - {upper:.2f}" for lower, upper in zip(elo['95% CI_lower'], elo['95% CI_upper'])]
                 ],
                 align="left",
                 font=dict(size=16),
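For reference, a minimal, self-contained version of the table this hunk builds, with dummy ratings (only the header/cells structure mirrors the diff; the data is illustrative):

import pandas as pd
import plotly.graph_objects as go

elo = pd.DataFrame(
    {
        "Model": ["model_x", "model_y"],
        "Elo rating": [1032.41, 967.59],
        "95% CI_lower": [1010.1, 944.0],
        "95% CI_upper": [1055.2, 990.3],
    }
)

fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>", "<b>95% CI</b>"],
                fill_color="paleturquoise",
                align="left",
            ),
            cells=dict(
                values=[
                    list(range(1, len(elo) + 1)),
                    elo["Model"],
                    elo["Elo rating"].round(2),
                    [
                        f"{lower:.2f} - {upper:.2f}"
                        for lower, upper in zip(elo["95% CI_lower"], elo["95% CI_upper"])
                    ],
                ],
                align="left",
            ),
        )
    ]
)
fig.show()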
view_utils.py
CHANGED

(Identical changes to streamlit_app_local/view_utils.py above.)

@@ -154,6 +154,44 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     return df
 
 
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025)
+    ci_upper = bootstrap_df.quantile(0.975)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
+
+
+
 def fill_missing_values(df, default_value=0):
     """
     This is used for completing pivot table
@@ -378,14 +416,16 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
     elo_wr = compute_relative_winrate_to_1st(elo)
     # beautify
     elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
     elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
     elo_wr.index.name = "Rank"
+    elo_wr["95% CI"] = elo_wr.apply(lambda row: f"({row['95% CI_upper']:.1f}, {row['95% CI_lower']:.1f})", axis=1)
+    elo_wr = elo_wr.rename(columns = {"95% CI": "95% CI (UB, LB)"})
 
-    figure_dict["elo_rating"] = elo_wr
+    figure_dict["elo_rating"] = elo_wr[["Model", "Elo rating", "95% CI (UB, LB)", "winrate_vs_1st"]]
 
    # Elo Rating by Task: Radar chart
     if is_overall: