Commit 3193aca · committed by cyx96
1 Parent(s): f10e39e

added phi4
Files changed:
- .gitignore +1 -0
- app.py +19 -54
- main_backend.py +17 -19
- requirements.txt +3 -2
- src/backend/evaluate_model.py +29 -14
- src/backend/manage_requests.py +3 -4
- src/backend/model_operations.py +50 -56
- src/backend/run_eval_suite.py +17 -18
- src/display/about.py +2 -4
- src/display/utils.py +11 -5
- src/envs.py +7 -2
- src/leaderboard/read_evals.py +3 -4
.gitignore CHANGED
@@ -17,6 +17,7 @@ eval-results-bk_hhem21/
 eval-results_hhem21/
 hhem21_server/
 leaderboard_results/
+leaderboard-bk/
 
 src/assets/model_counts.html
 
app.py CHANGED
@@ -21,6 +21,13 @@ try:
     )
 except Exception:
     restart_space()
+
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
+
 try:
     print(envs.EVAL_RESULTS_PATH)
     snapshot_download(
@@ -32,24 +39,15 @@ except Exception:
 raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
-
-
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
     columns: list,
     type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
     query: str,
 ):
-    …
+    print(f"filter: columns={columns}, type_query={type_query}, query={query}")
+    filtered_df = filter_models(hidden_df, type_query)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
     return df
@@ -83,32 +81,23 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
                     final_df.append(temp_filtered_df)
         if len(final_df) > 0:
             filtered_df = pd.concat(final_df)
-            filtered_df = filtered_df.drop_duplicates(
-                subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name]
-            )
+            #filtered_df = filtered_df.drop_duplicates(subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name])
+            filtered_df = filtered_df.drop_duplicates(subset=[utils.AutoEvalColumn.model.name])
 
     return filtered_df
 
 
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-) -> pd.DataFrame:
-    # Show all models
-    # if show_deleted:
-    #     filtered_df = df
-    # else: # Show only still on the hub models
-    #     filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]]
-
+def filter_models(df: pd.DataFrame, type_query: list) -> pd.DataFrame:
     filtered_df = df
 
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+    # filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
-    numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
+    # numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
+    # params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
+    # mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+    # filtered_df = filtered_df.loc[mask]
 
     return filtered_df
 
@@ -148,12 +137,8 @@ with demo:
                 elem_id="column-select",
                 interactive=True,
             )
-
-            # deleted_models_visibility = gr.Checkbox(
-            #     value=False, label="Show gated/private/deleted models", interactive=True
-            # )
+
             with gr.Column(min_width=320):
-                #with gr.Box(elem_id="box-filter"):
                 filter_columns_type = gr.CheckboxGroup(
                     label="Model types",
                     choices=[t.to_str() for t in utils.ModelType],
@@ -161,20 +146,6 @@ with demo:
                     interactive=True,
                     elem_id="filter-columns-type",
                 )
-                # filter_columns_precision = gr.CheckboxGroup(
-                #     label="Precision",
-                #     choices=[i.value.name for i in utils.Precision],
-                #     value=[i.value.name for i in utils.Precision],
-                #     interactive=True,
-                #     elem_id="filter-columns-precision",
-                # )
-                # filter_columns_size = gr.CheckboxGroup(
-                #     label="Model sizes (in billions of parameters)",
-                #     choices=list(utils.NUMERIC_INTERVALS.keys()),
-                #     value=list(utils.NUMERIC_INTERVALS.keys()),
-                #     interactive=True,
-                #     elem_id="filter-columns-size",
-                # )
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
@@ -203,23 +174,17 @@ with demo:
            hidden_leaderboard_table_for_search,
            shown_columns,
            filter_columns_type,
-            # filter_columns_precision,
-            # filter_columns_size,
-            # deleted_models_visibility,
            search_bar,
        ],
        leaderboard_table,
    )
-    for selector in [shown_columns, filter_columns_type]:
+    for selector in [shown_columns, filter_columns_type]:
        selector.change(
            update_table,
            [
                hidden_leaderboard_table_for_search,
                shown_columns,
                filter_columns_type,
-                # filter_columns_precision,
-                # filter_columns_size,
-                # deleted_models_visibility,
                search_bar,
            ],
            leaderboard_table,
main_backend.py CHANGED
@@ -20,10 +20,8 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
-snapshot_download(repo_id=envs.RESULTS_REPO, revision="main",
-                  local_dir=envs.EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
-                  local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+snapshot_download(repo_id=envs.RESULTS_REPO, revision="main", local_dir=envs.EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+snapshot_download(repo_id=envs.QUEUE_REPO, revision="main", local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
 def run_auto_eval(args):
@@ -41,9 +39,9 @@ def run_auto_eval(args):
         local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
     )
     logging.info("Checked completed evals")
-    eval_requests = manage_requests.get_eval_requests(
-        …
+    eval_requests = manage_requests.get_eval_requests(
+        job_status=current_pending_status, hf_repo=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
+    )
     logging.info("Got eval requests")
     eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
     logging.info("Sorted eval requests")
@@ -65,20 +63,11 @@ def run_auto_eval(args):
         eval_request = eval_requests[0]
         pp.pprint(eval_request)
 
-        # manage_requests.set_eval_request(
-        #     api=envs.API,
-        #     eval_request=eval_request,
-        #     new_status=RUNNING_STATUS,
-        #     hf_repo=envs.QUEUE_REPO,
-        #     local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
-        # )
-        # logging.info("Set eval request to running, now running eval")
-
         run_eval_suite.run_evaluation(
             eval_request=eval_request,
             local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
             results_repo=envs.RESULTS_REPO,
-            batch_size=
+            batch_size=args.batch_size,
             device=envs.DEVICE,
             no_cache=True,
             need_check=not args.publish,
@@ -88,6 +77,7 @@ def run_auto_eval(args):
     else:
         eval_request = manage_requests.EvalRequest(
             model=args.model,
+            model_path=args.model_path,
             status=PENDING_STATUS,
             precision=args.precision
         )
@@ -98,10 +88,13 @@ def run_auto_eval(args):
             eval_request=eval_request,
             local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
             results_repo=envs.RESULTS_REPO,
-            batch_size=
+            batch_size=args.batch_size,
             device=envs.DEVICE,
             need_check=not args.publish,
-            write_results=args.update
+            write_results=args.update,
+            limit=args.limit,
+            use_vllm=args.use_vllm,
+            tensor_parallel_size=args.tensor_parallel_size,
         )
         logging.info("Reproducibility eval finished")
 
@@ -112,9 +105,14 @@ def main():
     # Optional arguments
     parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
     parser.add_argument("--model", type=str, default=None, help="Your Model ID")
+    parser.add_argument("--model_path", type=str, default=None, help="Full path of model")
     parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
     parser.add_argument("--publish", type=bool, default=False, help="whether directly publish the evaluation results on HF")
     parser.add_argument("--update", type=bool, default=False, help="whether to update google drive files")
+    parser.add_argument("--limit", type=int, default=None, help="Limit on the number of items to process")
+    parser.add_argument("--use_vllm", type=bool, default=False, help="Whether to infer with vllm or not")
+    parser.add_argument("--tensor_parallel_size", type=int, default=1)
+    parser.add_argument("--batch_size", type=int, default=1)
 
     args = parser.parse_args()
 
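Note on the boolean flags above: argparse's type=bool does not parse the strings "True"/"False"; it simply applies bool() to the raw argument, so any non-empty value is treated as True. A minimal, standalone sketch of that behavior (not part of the repo):

import argparse

parser = argparse.ArgumentParser()
# Mirrors the flag style used in main_backend.py: type=bool converts the raw string with bool().
parser.add_argument("--use_vllm", type=bool, default=False)

print(parser.parse_args([]).use_vllm)                       # False (default is used)
print(parser.parse_args(["--use_vllm", "False"]).use_vllm)  # True, because bool("False") is True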
requirements.txt CHANGED
@@ -27,6 +27,7 @@ google-cloud-aiplatform>=1.38
 qwen-vl-utils
 vertexai
 # git+https://github.com/huggingface/transformers
-transformers==4.
+transformers==4.51.3
 together==1.3.0
-spacy
+spacy
+vllm==0.8.5
src/backend/evaluate_model.py CHANGED
@@ -29,9 +29,14 @@ class Evaluator:
         summary_generator (SummaryGenerator): Instance for generating summaries.
         eval_model (EvaluationModel): Instance for evaluating summaries.
     """
-    def __init__(
-        …
+    def __init__(
+        self, model, revision, precision, batch_size,
+        device, no_cache, limit, write_out=True,
+        output_base_path='logs',
+        model_path=None,
+        use_vllm=False,
+        tensor_parallel_size=1
+    ):
         """Initializes the Evaluator with the given model and settings.
 
         Args:
@@ -47,6 +52,7 @@ class Evaluator:
             output_base_path (str): Base path for output files.
         """
         self.model = model
+        self.model_path = model_path
         self.revision = revision
         self.precision = precision
         self.batch_size = batch_size
@@ -56,7 +62,7 @@ class Evaluator:
         self.write_out = write_out
         self.output_base_path = output_base_path
         try:
-            self.summary_generator = SummaryGenerator(model, revision, self.device)
+            self.summary_generator = SummaryGenerator(model, revision, self.device, model_path=self.model_path, use_vllm=use_vllm, tensor_parallel_size=tensor_parallel_size)
             self.eval_model = EvaluationModel(envs.HEM_PATH, self.device)
         except Exception as e:
             logging.error(f"Error initializing Evaluator: {e}")
@@ -71,26 +77,35 @@ class Evaluator:
             dict: A dictionary containing evaluation results.
         """
         try:
+            # print(envs.DATA_LEADERBOARD_REPO)
+            # snapshot_download(
+            #     repo_id=envs.DATA_LEADERBOARD_REPO, local_dir=envs.DATA_LEADERBOARD_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+            # )
+
             df = pd.read_csv(envs.DATASET_PATH)
-            self.
+            if self.limit is not None and self.limit > 0:
+                df = df.head(self.limit)
+            self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv", batch_size=self.batch_size)
 
             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
 
-            self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
-                self.generated_summaries_df)
+            self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(self.generated_summaries_df)
             factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
             hallucination_rate = self.eval_model.hallucination_rate
 
-            results = util.format_results(
-                …
+            results = util.format_results(
+                model_name=self.model,
+                revision=self.revision,
+                precision=self.precision,
+                factual_consistency_rate=factual_consistency_rate,
+                hallucination_rate=hallucination_rate,
+                answer_rate=answer_rate,
+                avg_summary_len=avg_summary_len
+            )
             return results
         except FileNotFoundError:
-            logging.error(f"File not found: {envs.
+            logging.error(f"File not found: {envs.DATA_LEADERBOARD_NAME}")
             raise
         except Exception as e:
             logging.error(f"Error during evaluation: {e}")
src/backend/manage_requests.py CHANGED
@@ -24,6 +24,7 @@ class EvalRequest:
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
+    model_path: Optional[str] = None
 
     def get_model_args(self):
         model_args = f"pretrained={self.model},revision={self.revision}"
@@ -36,8 +37,7 @@ class EvalRequest:
         return model_args
 
 
-def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str,
-                     hf_repo: str, local_dir: str):
+def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str, hf_repo: str, local_dir: str):
     """Updates a given eval request with its new status on the hub (running, completed, failed,)"""
     json_filepath = eval_request.json_filepath
 
@@ -65,8 +65,7 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
     Returns:
         list[EvalRequest]: a list of model info dicts.
     """
-    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
-                      repo_type="dataset", max_workers=60)
+    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
src/backend/model_operations.py CHANGED
@@ -11,7 +11,7 @@ import pandas as pd
 import spacy
 import litellm
 from tqdm import tqdm
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline,
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSequenceClassification, AutoConfig, Qwen2VLForConditionalGeneration, AutoProcessor
 from peft import PeftModel
 import torch
 import cohere
@@ -19,11 +19,12 @@ from openai import OpenAI
 from together import Together
 import anthropic
 import replicate
-# import google.generativeai as genai
 import vertexai
 from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason
 from mistralai import Mistral
 from qwen_vl_utils import process_vision_info
+from vllm import LLM
+from vllm import SamplingParams
 
 
 import src.backend.util as util
@@ -32,8 +33,7 @@ import src.envs as envs
 litellm.set_verbose=True
 
 # Set up basic configuration for logging
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 # Load spacy model for word tokenization
 nlp = spacy.load("en_core_web_sm")
@@ -66,7 +66,7 @@ class SummaryGenerator:
         answer_rate (float): Rate of non-empty summaries.
     """
 
-    def __init__(self, model_id, revision, device):
+    def __init__(self, model_id, revision, device, model_path=None, use_vllm=False, tensor_parallel_size=1):
         """
         Initializes the SummaryGenerator with a model.
 
@@ -76,6 +76,7 @@ class SummaryGenerator:
         """
         self.model_id = model_id
         self.model = f"huggingface/{model_id}"
+        self.model_path = model_path
         self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
         self.summaries_df = pd.DataFrame()
         self.revision = revision
@@ -86,7 +87,10 @@ class SummaryGenerator:
         self.local_model = None
         self.local_pipeline = None
 
-    …
+        self.use_vllm = use_vllm
+        self.tensor_parallel_size = tensor_parallel_size
+
+    def generate_summaries(self, df, save_path=None, batch_size=1):
         """Generate summaries for a given DataFrame of source docs.
 
         Args:
@@ -113,7 +117,6 @@ class SummaryGenerator:
                 while not _summary:
                     try:
                         _summary = self.generate_summary(system_prompt, user_prompt)
-                        # print(f"Finish index {index}")
                         break
                     except Exception as e:
                         if 'Rate limit reached' in str(e):
@@ -142,8 +145,7 @@ class SummaryGenerator:
             # Sleep to prevent hitting rate limits too frequently
             time.sleep(1)
 
-        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
-                                         columns=["source", "summary", "dataset"])
+        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)), columns=["source", "summary", "dataset"])
 
         if save_path is not None:
             print(f'Save summaries to {save_path}')
@@ -164,7 +166,7 @@ class SummaryGenerator:
         using_replicate_api = False
         replicate_api_models = ['snowflake', 'llama-3.1-405b']
         using_pipeline = False
-        pipeline_models = ['llama-3.1', 'phi-3-mini','falcon-7b', 'phi-3.5', 'mistral-nemo', 'llama-3.3']
+        pipeline_models = ['llama-3.1', 'phi-3-mini','falcon-7b', 'phi-3.5', 'mistral-nemo', 'llama-3.3', 'phi-4']
 
         for replicate_api_model in replicate_api_models:
             if replicate_api_model in self.model_id.lower():
@@ -405,40 +407,32 @@ class SummaryGenerator:
                     trust_remote_code=True
                 )
             else:
-                …
-            else:
-                self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
-                print("Tokenizer loaded")
-                if 'jamba' in self.model_id.lower():
-                    self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id,
-                                                                            torch_dtype=torch.bfloat16,
-                                                                            attn_implementation="flash_attention_2",
-                                                                            device_map="auto",
-                                                                            use_mamba_kernels=False)
+                print(f"loading tokenizer from {self.model_path or self.model_id}")
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_path or self.model_id, trust_remote_code=True)
 
+            print(f"loading model from {self.model_path or self.model_id}")
+            if 'jamba' in self.model_id.lower():
+                self.local_model = AutoModelForCausalLM.from_pretrained(
+                    self.model_id,
+                    torch_dtype=torch.bfloat16,
+                    attn_implementation="flash_attention_2",
+                    device_map="auto",
+                    use_mamba_kernels=False
+                )
             elif 'qwen2-vl' in self.model_id.lower():
                 self.local_model = Qwen2VLForConditionalGeneration.from_pretrained(
                     self.model_id, torch_dtype="auto", device_map="auto"
                )
                self.processor = AutoProcessor.from_pretrained(self.model_id)
-
-            # elif 'ragamuffin' in self.model_id.lower():
-            #     print('Using ragamuffin')
-            #     self.local_model = AutoModelForCausalLM.from_pretrained(os.path.join('/home/miaoran', self.model_id),
-            #                                                             torch_dtype=torch.bfloat16, # forcing bfloat16 for now
-            #                                                             attn_implementation="flash_attention_2")
            elif 'olmo' in self.model_id.lower():
-                self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id)
-
+                self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id)
            elif 'qwq-' in self.model_id.lower():
                self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype="auto", device_map="auto")
-
            else:
-                …
+                if self.use_vllm:
+                    self.local_model = LLM(self.model_path or self.model_id, tensor_parallel_size=self.tensor_parallel_size, trust_remote_code=True)
+                else:
+                    self.local_model = AutoModelForCausalLM.from_pretrained(self.model_path or self.model_id, trust_remote_code=True, device_map="auto")
 
        # Using local model/pipeline
        if self.local_pipeline:
@@ -502,7 +496,10 @@ class SummaryGenerator:
            elif 'qwq' in self.model_id.lower():
                input_ids = self.tokenizer([prompt], return_tensors="pt").to(self.device)
            else:
-                …
+                if self.use_vllm:
+                    input_ids = [prompt]
+                else:
+                    input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
 
            # Generate outputs
            if 'granite' in self.model_id.lower():
@@ -513,14 +510,18 @@ class SummaryGenerator:
            elif 'qwq' in self.model_id.lower():
                outputs = self.local_model.generate(**input_ids, max_new_tokens=512, do_sample=True, temperature=0.01)
            else:
-                …
+                if self.use_vllm:
+                    sampling_params = SamplingParams(temperature=0.01, max_tokens=250)
+                    outputs = self.local_model.generate(input_ids, sampling_params, use_tqdm=False)
+                else:
+                    with torch.no_grad():
+                        outputs = self.local_model.generate(**input_ids, do_sample=True, max_new_tokens=250, temperature=0.01)#, pad_token_id=self.tokenizer.eos_token_id
+
            if 'glm' in self.model_id.lower() or 'ragamuffin' in self.model_id.lower() or 'granite' in self.model_id.lower():
                outputs = outputs[:, input_ids['input_ids'].shape[1]:]
            elif 'qwen2-vl' in self.model_id.lower() or 'qwen2.5' in self.model_id.lower() or 'qwq-' in self.model_id.lower():
-                outputs = [
-                    out_ids[len(in_ids) :] for in_ids, out_ids in zip(input_ids.input_ids, outputs)
-                ]
+                if not self.use_vllm:
+                    outputs = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(input_ids.input_ids, outputs)]
 
            # Decode outputs
            if 'qwen2-vl' in self.model_id.lower():
@@ -530,7 +531,10 @@ class SummaryGenerator:
            elif 'olmo' in self.model_id.lower() or 'qwq' in self.model_id.lower():
                result = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
            else:
-                …
+                if self.use_vllm:
+                    result = outputs[0].outputs[0].text
+                else:
+                    result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
            if 'gemma-2' in self.model_id.lower():
                result = result.split(user_prompt + '\nmodel')[-1].strip()
@@ -593,13 +597,14 @@ class EvaluationModel:
        Args:
            model_path (str): Path to the CrossEncoder model.
        """
-        …
-        self.model = AutoModelForTokenClassification.from_pretrained(model_path, config=config)
+        self.model = AutoModelForSequenceClassification.from_pretrained(model_path, trust_remote_code=True)
        self.device = device
        self.model.to(self.device)
        self.scores = []
        self.factual_consistency_rate = None
        self.hallucination_rate = None
+
+        self.model.eval()
 
    def predict(self, text_pairs):
        """Load LoRA adapters of HHEM and make predictions
@@ -609,20 +614,9 @@ class EvaluationModel:
            checkpoint: model ID on Hugging Face
        """
 
-        prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
-
-        tokenizer = AutoTokenizer.from_pretrained('t5-base')
-        inputs = tokenizer(
-            [prompt.format(text1=pair[0], text2=pair[1]) for pair in text_pairs],
-            return_tensors='pt', padding='longest').to(self.device)
-
-        self.model.eval()
        with torch.no_grad():
-            output = self.model(
-                …
-            logits = logits[:,0,:] # get the logits on the first token
-            logits = torch.softmax(logits, dim=-1)
-            scores = [round(x, 5) for x in logits[:, 1].tolist()] # list of float
+            output = self.model.predict(text_pairs)
+            scores = output.tolist()
        return scores
 
    def evaluate_hallucination(self, summaries_df):
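For context, a minimal sketch of the vLLM path this diff wires into SummaryGenerator (offline LLM plus SamplingParams, near-greedy decoding). The model id and prompt below are placeholders for illustration only; the Space passes model_path or model_id:

from vllm import LLM, SamplingParams

# Placeholder model id; any vLLM-supported checkpoint works the same way.
llm = LLM("microsoft/phi-4", tensor_parallel_size=1, trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.01, max_tokens=250)

# generate() takes a list of prompt strings and returns one RequestOutput per prompt.
outputs = llm.generate(["Provide a concise summary of the following passage: ..."], sampling_params, use_tqdm=False)
summary = outputs[0].outputs[0].text
print(summary)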
src/backend/run_eval_suite.py CHANGED
@@ -8,14 +8,15 @@ from src.backend.manage_requests import EvalRequest
 from src.backend.evaluate_model import Evaluator
 
 # Configure logging
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logging.getLogger("openai").setLevel(logging.WARNING)
 
 
-def run_evaluation(
-    …
+def run_evaluation(
+    eval_request: EvalRequest, batch_size, device,
+    local_dir: str, results_repo: str, no_cache=True, limit=None,
+    need_check=True, write_results=False, use_vllm=False, tensor_parallel_size=1,
+):
     """
     Run the evaluation for a given model and upload the results.
 
@@ -32,21 +33,20 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
     Returns:
         dict: A dictionary containing evaluation results.
     """
-    if limit:
+    if limit is not None and limit > 0:
         logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
     output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
-    # if os.path.exists(output_folder):
-    #     f_name = os.listdir(output_folder)[-1]
-    #     print(f"Loading results from {os.path.join(output_folder, f_name)}")
-    #     results = json.loads(os.path.join(output_folder, f_name))
-    #     dumped = json.dumps(results, indent=2)
-    #     logging.info(dumped)
-    # else:
     try:
-        evaluator = Evaluator(
-            …
+        evaluator = Evaluator(
+            eval_request.model, eval_request.revision, eval_request.precision,
+            batch_size, device, no_cache, limit, write_out=True,
+            output_base_path='logs',
+            model_path=eval_request.model_path,
+            use_vllm=use_vllm,
+            tensor_parallel_size=tensor_parallel_size
+        )
+
         results = evaluator.evaluate()
         if write_results:
             evaluator.write_results()
@@ -67,8 +67,7 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
         dumped = json.dumps(results, indent=2)
         logging.info(dumped)
 
-        output_path = os.path.join(output_folder,
-                                   f"results_{datetime.now()}.json") #
+        output_path = os.path.join(output_folder, f"results_{datetime.now()}.json") #
         os.makedirs(output_folder, exist_ok=True)
         with open(output_path, "w") as f:
             f.write(dumped)
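A hedged sketch of how the backend would invoke this entry point with the new keyword arguments; the request values below are illustrative placeholders, not taken from the actual queue:

import src.envs as envs
from src.backend import manage_requests, run_eval_suite

# Illustrative request; real ones are read from the requests repo.
eval_request = manage_requests.EvalRequest(
    model="microsoft/phi-4",
    status="PENDING",
    precision="float16",
    model_path=None,  # or a local checkpoint directory
)

results = run_eval_suite.run_evaluation(
    eval_request=eval_request,
    batch_size=1,
    device=envs.DEVICE,
    local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
    results_repo=envs.RESULTS_REPO,
    no_cache=True,
    limit=10,              # testing only, per the warning in the docstring
    need_check=False,
    write_results=False,
    use_vllm=True,
    tensor_parallel_size=1,
)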
src/display/about.py CHANGED
@@ -10,12 +10,10 @@ class Task:
 
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    hallucination_rate = Task("hallucination_rate",
-                              "hallucination_rate", "Hallucination Rate (%)")
+    hallucination_rate = Task("hallucination_rate", "hallucination_rate", "Hallucination Rate (%)")
     factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
     answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
-    average_summary_length = Task("average_summary_length",
-                                  "average_summary_length", "Average Summary Length")
+    average_summary_length = Task("average_summary_length", "average_summary_length", "Average Summary Length")
 
 
 # Your leaderboard name
src/display/utils.py CHANGED
@@ -1,4 +1,4 @@
-from dataclasses import dataclass, make_dataclass
+from dataclasses import dataclass, make_dataclass, field
 from enum import Enum
 
 import pandas as pd
@@ -21,13 +21,19 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
+    def __hash__(self) -> int:
+        import time
+        import random
+        seed = hash(self.name) + hash(self.type) + hash(self.displayed_by_default) + \
+               hash(self.hidden) + hash(self.never_hidden) + hash(self.dummy) + \
+               hash(time.time()) + random.randint(0, 10000)
+        return seed
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
-                              ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent,
-                              ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
src/envs.py CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
 # replace this with our token
 TOKEN = os.environ.get("HF_TOKEN", None)
 
-OWNER = "
+OWNER = "airlsyn"
 REPO_ID = f"{OWNER}/leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
@@ -20,13 +20,18 @@ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
+DATA_LEADERBOARD_REPO = f"{OWNER}/leaderboard_dataset"
+DATA_LEADERBOARD_PATH = os.path.join(CACHE_PATH, "leaderboard-bk")
+DATA_LEADERBOARD_NAME = os.path.join(DATA_LEADERBOARD_PATH, "leaderboard_dataset_16k.csv")
+
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
 API = HfApi(token=TOKEN)
 
 LEADERBOARD_DATASET_PATH = "leaderboard_results/leaderboard_summaries.csv"
 DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
 SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
-HEM_PATH = 'vectara/HHEM-2.1'
+# HEM_PATH = 'vectara/HHEM-2.1'
+HEM_PATH = 'vectara/hallucination_evaluation_model'
 
 SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
 USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
src/leaderboard/read_evals.py CHANGED
@@ -104,7 +104,7 @@ class EvalResult:
 
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            #
+            # utils.AutoEvalColumn.precision.name: self.precision.value.name,
             utils.AutoEvalColumn.model_type.name: self.model_type.value.name,
             utils.AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             utils.AutoEvalColumn.weight_type.name: self.weight_type.value.name,
@@ -114,7 +114,7 @@ class EvalResult:
             # utils.AutoEvalColumn.revision.name: self.revision,
             # utils.AutoEvalColumn.license.name: self.license,
             # utils.AutoEvalColumn.likes.name: self.likes,
-            #
+            # utils.AutoEvalColumn.params.name: self.num_params,
             # utils.AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
@@ -172,8 +172,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in
-                                                    eval_result.results.items() if v is not None})
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
         else:
             eval_results[eval_name] = eval_result
 