[email protected]
commited on
Commit
·
a88ccc4
1
Parent(s):
ba378b3
95% CI added (rebuttal)
Files changed:
- app.py +2 -0
- streamlit_app_local/app.py +2 -0
- streamlit_app_local/view_utils.py +42 -2
- varco_arena/varco_arena_core/elo.py +41 -1
- varco_arena/varco_arena_core/prompts/llmbar.py +1 -0
- varco_arena/varco_arena_core/visualization.py +5 -4
- view_utils.py +42 -2
app.py
CHANGED

@@ -205,6 +205,7 @@ def run_varco_arena(
     last_update_time = time.time()
     terminal_output = st.empty()
     full_output = f"{command}\n"
+    to_show = full_output
     while True:
         # Check if we have output to read
         if select.select([process.stdout], [], [], 0)[0]:
@@ -229,6 +230,7 @@ def run_varco_arena(
         # Check if the process has finished
         if process.poll() is not None:
             # Read any remaining output
+            os.set_blocking(process.stdout.fileno(), True)
             remaining_output = process.stdout.read()
             if remaining_output:
                 lines = remaining_output.split("\n")
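The hunks above only show fragments of the streaming loop, so here is a minimal, self-contained sketch of the pattern they modify: poll the subprocess pipe with a zero-timeout select while the process runs, then flip the descriptor back to blocking with os.set_blocking(..., True) before the final read() so the last chunk is fully drained (a non-blocking read can return early when no data is ready). This is a POSIX-only sketch with an illustrative command, not the app's actual code:

import os
import select
import subprocess

# Illustrative command only; the real app launches its own CLI.
process = subprocess.Popen(
    ["bash", "-c", "for i in 1 2 3; do echo line $i; sleep 0.2; done"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
)
# Non-blocking while the loop is live, so the UI loop never stalls on the pipe.
os.set_blocking(process.stdout.fileno(), False)

full_output = ""
while True:
    # Zero timeout: select returns immediately whether or not data is ready.
    if select.select([process.stdout], [], [], 0)[0]:
        line = process.stdout.readline()
        if line:
            full_output += line

    if process.poll() is not None:
        # Back to blocking so read() drains the pipe to EOF.
        os.set_blocking(process.stdout.fileno(), True)
        remaining_output = process.stdout.read()
        if remaining_output:
            full_output += remaining_output
        break

print(full_output, end="")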
streamlit_app_local/app.py
CHANGED

@@ -106,6 +106,7 @@ def run_varco_arena(
     last_update_time = time.time()
     terminal_output = st.empty()
    full_output = f"{command}\n"
+    to_show = full_output
     while True:
         # Check if we have output to read
         if select.select([process.stdout], [], [], 0)[0]:
@@ -130,6 +131,7 @@ def run_varco_arena(
         # Check if the process has finished
         if process.poll() is not None:
             # Read any remaining output
+            os.set_blocking(process.stdout.fileno(), True)
             remaining_output = process.stdout.read()
             if remaining_output:
                 lines = remaining_output.split("\n")
streamlit_app_local/view_utils.py
CHANGED

@@ -154,6 +154,44 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     return df
 
 
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025)
+    ci_upper = bootstrap_df.quantile(0.975)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
+
+
+
 def fill_missing_values(df, default_value=0):
     """
     This is used for completing pivot table
@@ -378,14 +416,16 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
     elo_wr = compute_relative_winrate_to_1st(elo)
     # beautify
     elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
     elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
     elo_wr.index.name = "Rank"
+    elo_wr["95% CI"] = elo_wr.apply(lambda row: f"({row['95% CI_upper']:.1f}, {row['95% CI_lower']:.1f})", axis=1)
+    elo_wr = elo_wr.rename(columns = {"95% CI": "95% CI (UB, LB)"})
 
-    figure_dict["elo_rating"] = elo_wr
+    figure_dict["elo_rating"] = elo_wr[["Model", "Elo rating", "95% CI (UB, LB)", "winrate_vs_1st"]]
 
     # Elo Rating by Task: Radar chart
     if is_overall:
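The new compute_elo_with_ci is a percentile bootstrap: resample the battle rows with replacement, refit Elo on each resample, and take the 2.5th/97.5th percentiles of each model's rating distribution. A hypothetical usage sketch, assuming compute_elo_with_ci is importable and that the battles table follows the Chatbot-Arena-style model_a / model_b / winner schema that compute_mle_elo implementations typically consume (the column names and values here are assumptions, not taken from this repo):

import pandas as pd

# Hypothetical battles table; the real schema is whatever compute_mle_elo expects.
matches = pd.DataFrame(
    {
        "model_a": ["gpt", "claude", "gpt", "llama"],
        "model_b": ["claude", "llama", "llama", "gpt"],
        "winner": ["model_a", "model_a", "model_b", "model_b"],
    }
)

elo_ci = compute_elo_with_ci(matches, n_bootstrap=1000)
# One row per model: point estimate plus the bootstrap interval bounds.
print(elo_ci[["Model", "Elo rating", "95% CI_lower", "95% CI_upper"]])

Note that every bootstrap iteration refits the full MLE, so runtime scales linearly with n_bootstrap.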
varco_arena/varco_arena_core/elo.py
CHANGED

@@ -50,6 +50,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
         .reset_index(drop=True)
     )
     df.index = df.index + 1
+    df["Elo rating"] = df["Elo rating"]
 
     return df
 
@@ -69,7 +70,7 @@ def compute_relative_winrate_to_1st(elo_df):
 
     rating1st = elo_df["Elo rating"].max()
     win_rate_to_1st = partial(elo_to_winrate, rating_b=rating1st)
-    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st)
+    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st).round(3)
     print(elo_df)
 
     return elo_df
@@ -80,3 +81,42 @@ def elo_to_winrate(rating_a: float = None, rating_b: float = None) -> float:
     rate_diff = rating_a - rating_b
     win_rate = 1 / (1 + 10 ** (-rate_diff / 400))
     return win_rate
+
+
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025).round(1)
+    ci_upper = bootstrap_df.quantile(0.975).round(1)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df["Elo rating"] = result_df["Elo rating"].round(1)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
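To make the quantile step concrete: bootstrap_df holds one row per resample and one column per model, so quantile(0.025) and quantile(0.975) collapse each column into the interval endpoints. A toy, self-contained illustration of the same percentile bootstrap applied to a simple mean (synthetic data, not the arena's):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.Series(rng.normal(loc=1000, scale=50, size=200))

# Resample with replacement and recompute the statistic each time.
boot_means = pd.Series(
    [data.sample(n=len(data), replace=True, random_state=i).mean() for i in range(1000)]
)

# The 95% percentile-bootstrap CI is just the 2.5th and 97.5th quantiles.
print(f"95% CI: ({boot_means.quantile(0.025):.1f}, {boot_means.quantile(0.975):.1f})")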
varco_arena/varco_arena_core/prompts/llmbar.py
CHANGED

@@ -155,6 +155,7 @@ class LLMBarPrompt(ComparisonPromptBase):
         out_b: str = None,
         task: Optional[str] = None,
         criteria_questions: Optional[str] = None,
+        **kwargs,
     ) -> List[Dict]:
         if (criteria_questions and task) or criteria_questions is None and task is None:
             raise ValueError(
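The **kwargs addition is the standard tolerant-signature pattern: callers can pass a superset of keyword arguments, and prompt classes that don't need some of them ignore the extras instead of raising TypeError. A stripped-down illustration (function and argument names are hypothetical, not this class's real signature):

from typing import Dict, List

def complete_prompt(inst: str, out_a: str, out_b: str, **kwargs) -> List[Dict]:
    # Extra keyword arguments (e.g. fields meant for other prompt classes)
    # are accepted here and simply ignored.
    return [{"role": "user", "content": f"{inst}\nA: {out_a}\nB: {out_b}"}]

# Works even though this prompt never uses `task`.
complete_prompt("Which answer is better?", "foo", "bar", task="summarization")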
varco_arena/varco_arena_core/visualization.py
CHANGED

@@ -2,7 +2,7 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 
-from .elo import compute_mle_elo
+from .elo import compute_elo_with_ci, compute_mle_elo
 
 
 def fill_missing_values(df, default_value=0):
@@ -137,7 +137,7 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
 
     char_width = 16
     header_char_width = 20
@@ -152,7 +152,7 @@ def visualization(results, is_overall=False):
         data=[
             go.Table(
                 header=dict(
-                    values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>"],
+                    values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>", "<b>95% CI</b>"],
                     fill_color="paleturquoise",
                     align="left",
                     font=dict(size=16),
@@ -161,7 +161,8 @@ def visualization(results, is_overall=False):
                 values=[
                     list(range(1, len(elo) + 1)),
                     elo["Model"],
-                    elo["Elo rating"],
+                    elo["Elo rating"].round(2),
+                    [f"{lower:.2f} - {upper:.2f}" for lower, upper in zip(elo['95% CI_lower'], elo['95% CI_upper'])]
                 ],
                 align="left",
                 font=dict(size=16),
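For reference, a minimal, self-contained version of the table this hunk builds, with dummy ratings (only the header/cells structure mirrors the diff; the data is illustrative):

import pandas as pd
import plotly.graph_objects as go

elo = pd.DataFrame(
    {
        "Model": ["model_x", "model_y"],
        "Elo rating": [1032.41, 967.59],
        "95% CI_lower": [1010.1, 944.0],
        "95% CI_upper": [1055.2, 990.3],
    }
)

fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>", "<b>95% CI</b>"],
                fill_color="paleturquoise",
                align="left",
            ),
            cells=dict(
                values=[
                    list(range(1, len(elo) + 1)),
                    elo["Model"],
                    elo["Elo rating"].round(2),
                    [
                        f"{lower:.2f} - {upper:.2f}"
                        for lower, upper in zip(elo["95% CI_lower"], elo["95% CI_upper"])
                    ],
                ],
                align="left",
            ),
        )
    ]
)
fig.show()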
view_utils.py
CHANGED

(Identical changes to streamlit_app_local/view_utils.py above.)

@@ -154,6 +154,44 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     return df
 
 
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025)
+    ci_upper = bootstrap_df.quantile(0.975)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
+
+
+
 def fill_missing_values(df, default_value=0):
     """
     This is used for completing pivot table
@@ -378,14 +416,16 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
     elo_wr = compute_relative_winrate_to_1st(elo)
     # beautify
     elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
     elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
     elo_wr.index.name = "Rank"
+    elo_wr["95% CI"] = elo_wr.apply(lambda row: f"({row['95% CI_upper']:.1f}, {row['95% CI_lower']:.1f})", axis=1)
+    elo_wr = elo_wr.rename(columns = {"95% CI": "95% CI (UB, LB)"})
 
-    figure_dict["elo_rating"] = elo_wr
+    figure_dict["elo_rating"] = elo_wr[["Model", "Elo rating", "95% CI (UB, LB)", "winrate_vs_1st"]]
 
    # Elo Rating by Task: Radar chart
     if is_overall: